wrap pandas dataframe within class - pandas

I'd like to give a pandas DataFrame a custom class and extend it with custom methods, while still being able to use the common pandas syntax on it.
I implemented this:
import pandas as pd
class CustomDF():
    """Thin wrapper that stores a DataFrame and forwards unknown attributes to it.

    Only attribute access is forwarded. Python looks up special methods such
    as ``__getitem__`` on the type rather than through ``__getattr__``, so
    ``obj['col']`` is not delegated by this class.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __getattr__(self, name):
        # __getattr__ only runs after normal lookup has failed. If ``df``
        # itself is missing (instance created without running __init__),
        # reading ``self.df`` below would re-enter this method endlessly,
        # so fail fast with the conventional AttributeError instead.
        if name == 'df':
            raise AttributeError(name)
        return getattr(self.df, name)

    def foo(self):
        """Example custom method: print 'foo' and return None."""
        print('foo')
        return
the problem with the above code is that I'd like to make the following lines work:
a = CustomDF()
a.iloc[0,1]
a.foo()
but if I try to access a column of the dataframe by doing
print(a['column_name'])
I get the error "TypeError: 'CustomDF' object is not subscriptable"
Any idea how to get subscripting to work on the wrapper without having to access .df first?
Thanks!

Try inheriting from the pandas.DataFrame class. Like so:
from pandas import DataFrame
class CustomDF(DataFrame):
    """DataFrame subclass that adds custom methods on top of the pandas API."""

    @property
    def _constructor(self):
        # pandas builds the results of slicing, copying, etc. through
        # _constructor; returning this class keeps those results CustomDF
        # instances, so the custom methods stay available on them.
        return CustomDF

    def foo(self):
        """Example custom method: print 'foo' and return None."""
        print('foo')
        return
a = CustomDF([0,1])
a.foo()

Related

implement shortcut for pandas code by extending it

Pandas has a very nice feature to save a df to the clipboard. This helps a lot to take the df output and analyze/inspect it further in Excel.
df={'A':['x','y','x','z','y'],
'B':[1,2,2,2,2],
'C':['a','b','a','d','d']}
df=pd.DataFrame(df)
df.to_clipboard(excel=True,index=False)
However, I don't want to type df.to_clipboard(excel=True,index=False) each time I need to copy the df to the clipboard. Is there a way I can do something like df.clip()?
I tried to implement it like this but it does not work.
class ExtDF(pd.DataFrame):
    """DataFrame subclass with a one-call shortcut for copying to the clipboard."""

    @property
    def _constructor(self):
        # This has to be a property: pandas calls ``self._constructor(data)``
        # to build results, which only works if attribute access yields the
        # class itself rather than a bound method.
        return ExtDF

    def clip(self):
        """Copy this frame to the clipboard in Excel format, without the index.

        Returns whatever ``to_clipboard`` returns.

        NOTE(review): this name shadows the built-in ``DataFrame.clip``
        (value clipping) on ExtDF instances -- confirm that is acceptable.
        """
        return self.to_clipboard(excel=True, index=False)
I would monkey patch pandas.DataFrame rather than defining a subclass:
# define a function and monkey patch pandas.DataFrame
def clipxl(self):
    """Copy *self* to the clipboard in Excel format, without the index.

    Written to be attached to ``pd.DataFrame`` as a method; returns whatever
    ``self.to_clipboard`` returns.
    """
    return self.to_clipboard(excel=True, index=False)
pd.DataFrame.clip = clipxl
# now let's try it
df={'A': ['x','y','x','z','y'],
'B': [1,2,2,2,2],
'C': ['a','b','a','d','d']}
df=pd.DataFrame(df)
df.clip()
clipboard content:
A B C
x 1 a
y 2 b
x 2 a
z 2 d
y 2 d
using a module
expand_pandas.py
import pandas as pd
# define a function and monkey patch pandas.DataFrame
def clipxl(self):
    """Send *self* to the clipboard via ``to_clipboard(excel=True, index=False)``.

    Returns the result of that call unchanged.
    """
    return self.to_clipboard(excel=True, index=False)
def hello(self):
    """Return the fixed greeting 'Hello World!'; *self* is not used."""
    return 'Hello World!'
pd.DataFrame.clip = clipxl
pd.DataFrame.hello = hello
In your environment:
import pandas as pd
import expand_pandas
df = pd.DataFrame({'A': ['x','y','x','z','y'],
'B': [1,2,2,2,2],
'C': ['a','b','a','d','d']})
df.hello()
# 'Hello World!'
df.clip()
# sends to clipboard
Instead of the object oriented approach you could write a function that does it and pass the DataFrame as a parameter.
Alternatively, instead of modifying the DataFrame class itself, you can create a class that contains your dataframe and define a method that does what you want.

How do you add dataclasses as valid index values to a plotly chart?

I am trying to switch from the matplotlib pandas plotting backend to plotly. However, I am being held back by a common occurrence of this error:
TypeError: Object of type Quarter is not JSON serializable
Where Quarter is a dataclass in my codebase.
For a minimal example, consider:
@dataclass
class Foo:
    """Minimal dataclass holding a single integer in ``val``."""

    val: int
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df.plot.scatter(x='x', y='y')
As expected, the above returns:
TypeError: Object of type Foo is not JSON serializable
Now, I don't expect plotly to be magical, but adding a __float__ magic method allows the Foo objects to be used with the matplotlib backend:
# This works
@dataclass
class Foo:
    """Dataclass holding an integer ``val`` that can be converted with ``float()``."""

    val: int

    def __float__(self):
        # Lets float(foo) and float casts see the wrapped number.
        return float(self.val)
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df.plot.scatter(x='x', y='y')
How can I update my dataclass to allow for it to be used with the plotly backend?
You can get pandas to cast to float before invoking the plotting backend.
from dataclasses import dataclass
import pandas as pd
@dataclass
class Foo:
    """Integer wrapper; ``__float__`` makes ``astype(float)`` usable on a column of Foo."""

    val: int

    def __float__(self):
        return float(self.val)
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df["x"].astype(float)
pd.options.plotting.backend = "plotly"
df.assign(x=lambda d: d["x"].astype(float)).plot.scatter(x='x', y='y')
monkey patching
If you don't want to change your code, you can monkey patch the plotly implementation of the pandas plotting API
https://pandas.pydata.org/pandas-docs/stable/development/extending.html#plotting-backends
from dataclasses import dataclass
import pandas as pd
import wrapt, json
import plotly
@wrapt.patch_function_wrapper(plotly, 'plot')
def new_plot(wrapped, instance, args, kwargs):
    """Wrap ``plotly.plot`` so a non-JSON-serializable x column is cast to float.

    ``args[0]`` is taken to be the data being plotted and ``kwargs["x"]`` the
    name of the x column. When that column has object dtype and its values
    cannot be JSON-encoded, a copy of the data with the column cast to float
    is passed on instead. All other calls go through unchanged.
    """
    frame = args[0]
    x_col = kwargs.get("x")  # x may be omitted; then there is nothing to fix
    if x_col is not None:
        values = frame[x_col]
        # Only object columns can hold arbitrary Python objects; leave
        # numeric, datetime, etc. columns exactly as they are.
        if getattr(values, "dtype", None) == object:
            try:
                # Encode the values, not the Series itself: a Series is
                # never JSON serializable, so testing it would always fail.
                json.dumps(values.tolist())
            except TypeError:
                # Work on a copy so the caller's DataFrame is not modified.
                frame = frame.copy()
                frame[x_col] = values.astype(float)
                args = (frame, *args[1:])
    return wrapped(*args, **kwargs)
@dataclass
class Foo:
    """Single-field dataclass whose instances convert to float via ``val``."""

    val: int

    def __float__(self):
        return float(self.val)
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df["x"].astype(float)
pd.options.plotting.backend = "plotly"
df.plot.scatter(x='x', y='y')

How to convert a Pydantic model in FastAPI to a Pandas DataFrame?

I am trying to convert a Pydantic model to a Pandas DataFrame, but I am getting various errors.
Here is the code:
from typing import Optional
from fastapi import FastAPI
from pydantic import BaseModel
import pickle
import sklearn
import pandas as pd
import numpy as np
class Userdata(BaseModel):
current_res_month_dec: Optional[int] = 0
current_res_month_nov: Optional[int] = 0
async def return_recurrent_user_predictions_gb(user_data: Userdata):
empty_dataframe = pd.DataFrame([Userdata(**{
'current_res_month_dec': user_data.current_res_month_dec,
'current_res_month_nov': user_data.current_res_month_nov})], ignore_index=True)
This is the DataFrame that is returned when trying to execute it through /docs in my local environment:
Response body
Download
{
"0": {
"0": [
"current_res_month_dec",
0
]
},
"1": {
"0": [
"current_res_month_nov",
0
]
}
but if I try to use this DataFrame for a prediction:
model_has_afternoon = pickle.load(open('./models/model_gbclf_prob_current_product_has_afternoon.pickle', 'rb'))
result_afternoon = model_has_afternoon.predict_proba(empty_dataframe)[:, 1]
I get this error:
ValueError: setting an array element with a sequence.
I have tried building my own DataFrame before, and the predictions should work with a DataFrame.
You first need to convert the Pydantic model into a dictionary using Pydantic's dict() method. Note that other methods, such as Python's dict() function and .__dict__ attribute, have been found to be faster alternatives to Pydantic's dict() method (see this answer). However, since you are using a Pydantic model, it might be best to use Pydantic's dict() method, and then pass the dictionary to pandas.DataFrame() surrounded by square brackets; for example, pd.DataFrame([data.dict()]). As described in this answer, this approach can be used when you need the keys of the passed dict to be the columns and the values to be the rows. If you need to specify a different orientation, you can also use pandas.DataFrame.from_dict().
Working Example
from typing import Optional
from fastapi import FastAPI
from pydantic import BaseModel
import pandas as pd
app = FastAPI()
# Request body model: two optional integer columns defaulting to 0 and one
# string column defaulting to "foo".
class Userdata(BaseModel):
    col1: Optional[int] = 0
    col2: Optional[int] = 0
    col3: str = "foo"
# POST /submit: build a one-row DataFrame from the request body.
@app.post('/submit')
def submit_data(data: Userdata):
    # Wrapping the dict in a list makes its keys the columns and its values
    # a single row.
    # NOTE(review): pydantic v2 renames .dict() to .model_dump() -- confirm
    # which pydantic version this targets.
    df = pd.DataFrame([data.dict()])
    return "Success"
More Options
As you mentioned that you would like to use the DataFrame for Machine Learning predictions, it should be noted that there are a few other options for passing the data to the predict() and predict_proba() functions that do not require creating a DataFrame. These options include:
model.predict([[data.col1, data.col2, data.col3]])
and
model.predict([list(data.dict().values())])
Please have a look at this answer for more details. In case you would also need to respond back to the client with a DataFrame in JSON format, please take a look here.

error with pyspark foreachPartition() code restructuring

My code structure is as follows:
class SampleClass
def __init__(self, spark):
df = (spark.read.table("sampletable").repartition(100))
def func1(partition):
print(partition.getPartitionId)
print(partition.count())
// some operations on partition DF
df.rdd.foreachPartition(func1)
def main():
s = SampleClass()
if __name__ == "__main__":
main()
But I get an error like:
Could not serialize object: TypeError: can't pickle _thread.RLock objects
How can I fix this calling structure? I am using Databricks Runtime 7.3 LTS and Spark 3.0.
Thanks.

Python: How to use a dataframe outside the class which is built inside class?

class Builder():
def __init__(self, args, date, crv)
def listInstruments(self, crv):
df_list = pd.DataFrame(columns=['curve','instrument','quote'])
for instrument, data in self.instruments.items():
if 'BASIS' in instrument:
for........
else:
for quote in data['QUOTES']:
new_row = {'curve':crv,'instrument':instrument,'quote':quote}
df = df_list.append(new_row,ignore_index=True)
return df
Above is the code for reference. I would like to use df for further analysis outside this class. How can I print df outside this class ? Please suggest.