error with pyspark foreachPartition() code restructuring - dataframe

My code structure is as follows:
class SampleClass
def __init__(self, spark):
df = (spark.read.table("sampletable").repartition(100))
def func1(partition):
print(partition.getPartitionId)
print(partition.count())
// some operations on partition DF
df.rdd.foreachPartition(func1)
def main():
s = SampleClass()
if __name__ == "__main__":
main()
But I get an error like:
Could not serialize object: TypeError: can't pickle _thread.RLock objects
How can I fix this calling structure? I am using Databricks Runtime 7.3 LTS and Spark 3.0.
Thanks.

Related

implement shortcut for pandas code by extending it

Pandas has a very nice feature to save a df to clipboard. This help a lot to take the df output and analyze/inspect further in excel.
df={'A':['x','y','x','z','y'],
'B':[1,2,2,2,2],
'C':['a','b','a','d','d']}
df=pd.DataFrame(df)
df.to_clipboard(excel=True,index=False)
However I don't want to type df.to_clipboard(excel=True,index=False) each time I need to copy the df to clipboard. Is there a way I can do something like df.clip()
I tried to implement it like this but it does not work.
class ExtDF(pd.DataFrame):
def __init__(self, *args, **kwargs):
super(ExtDF, self).__init__(*args, **kwargs)
@property
def _constructor(self):
return ExtDF
def clip(self):
return self.to_clipboard(excel=True,index=False)
I would monkey patch pandas.DataFrame rather than defining a subclass:
# define a function and monkey patch pandas.DataFrame
def clipxl(self):
return self.to_clipboard(excel=True, index=False)
pd.DataFrame.clip = clipxl
# now let's try it
df={'A': ['x','y','x','z','y'],
'B': [1,2,2,2,2],
'C': ['a','b','a','d','d']}
df=pd.DataFrame(df)
df.clip()
clipboard content:
A B C
x 1 a
y 2 b
x 2 a
z 2 d
y 2 d
using a module
expand_pandas.py
import pandas as pd
# define a function and monkey patch pandas.DataFrame
def clipxl(self):
return self.to_clipboard(excel=True,index=False)
def hello(self):
return 'Hello World!'
pd.DataFrame.clip = clipxl
pd.DataFrame.hello = hello
In your environment:
import pandas as pd
import expand_pandas
df = pd.DataFrame({'A': ['x','y','x','z','y'],
'B': [1,2,2,2,2],
'C': ['a','b','a','d','d']})
df.hello()
# 'Hello World!'
df.clip()
# sends to clipboard
Instead of the object oriented approach you could write a function that does it and pass the DataFrame as a parameter.
Alternatively, instead of modifying the DataFrame class itself, you can create a class that contains your dataframe and define a method that does what you want.

How do you add dataclasses as valid index values to a plotly chart?

I am trying to switch from the matplotlib pandas plotting backend to plotly. However, I am being held back by a common occurrence of this error:
TypeError: Object of type Quarter is not JSON serializable
Where Quarter is a dataclass in my codebase.
For a minimal example, consider:
@dataclass
class Foo:
val:int
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df.plot.scatter(x='x', y='y')
As expected, the above returns:
TypeError: Object of type Foo is not JSON serializable
Now, I don't expect plotly to be magical, but adding a __float__ magic method allows the Foo objects to be used with the matplotlib backend:
# This works
@dataclass
class Foo:
val:int
def __float__(self):
return float(self.val)
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df.plot.scatter(x='x', y='y')
How can I update my dataclass to allow for it to be used with the plotly backend?
You can get pandas to cast to float before invoking plotting backend.
from dataclasses import dataclass
import pandas as pd
@dataclass
class Foo:
val:int
def __float__(self):
return float(self.val)
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df["x"].astype(float)
pd.options.plotting.backend = "plotly"
df.assign(x=lambda d: d["x"].astype(float)).plot.scatter(x='x', y='y')
monkey patching
if you don't want to change code, you can monkey patch the plotly implementation of pandas plotting API
https://pandas.pydata.org/pandas-docs/stable/development/extending.html#plotting-backends
from dataclasses import dataclass
import pandas as pd
import wrapt, json
import plotly
@wrapt.patch_function_wrapper(plotly, 'plot')
def new_plot(wrapped, instance, args, kwargs):
try:
json.dumps(args[0][kwargs["x"]])
except TypeError:
args[0][kwargs["x"]] = args[0][kwargs["x"]].astype(float)
return wrapped(*args, **kwargs)
@dataclass
class Foo:
val:int
def __float__(self):
return float(self.val)
df = pd.DataFrame({'x': [Foo(i) for i in range(10)], 'y':list(range(10))})
df["x"].astype(float)
pd.options.plotting.backend = "plotly"
df.plot.scatter(x='x', y='y')

Pytest - preserve a fresh copy of a file b/w tests, read from disk once

I want to read a large pandas dataframe from file, and then send it to each test that I run as a fixture. I would like a fresh version of the original df taken from memory, not read from disk.
Something like this:
import pytest
# <something here>
...
df = pd.read_csv("data.csv")
...
def test_1(df_fixture):
assert len(df_fixture) == 1_000_000
def test_2(df_fixture):
# Want a fresh df, without reading from disk again
assert sum(df_fixture["col1"]) == 10_000_000
You can put your main df in a fixture with scope="session" and the copy in another fixture with scope="function". In the test, request the fixture that provides the copy.
import pandas as pd
import pytest
@pytest.fixture(scope="session")
def get_main_df():
yield pd.read_csv("data.csv")
@pytest.fixture
def get_df_copy(get_main_df):
yield get_main_df.copy(deep=True)
def test_1(get_df_copy):
assert len(get_df_copy) == 1_000_000
def test_2(get_df_copy):
assert sum(get_df_copy["col1"]) == 10_000_000

Python: How to use a dataframe outside the class which is built inside class?

class Builder():
def __init__(self, args, date, crv)
def listInstruments(self, crv):
df_list = pd.DataFrame(columns=['curve','instrument','quote'])
for instrument, data in self.instruments.items():
if 'BASIS' in instrument:
for........
else:
for quote in data['QUOTES']:
new_row = {'curve':crv,'instrument':instrument,'quote':quote}
df = df_list.append(new_row,ignore_index=True)
return df
Above is the code for reference. I would like to use df for further analysis outside this class. How can I print df outside this class ? Please suggest.

wrap pandas dataframe within class

I'd like to wrap a pandas dataframe in a custom class and extend it with custom methods, while still being able to use the common pandas syntax on it.
I implemented this:
import pandas as pd
class CustomDF():
def __init__(self, df: pd.DataFrame):
self.df = df
def __getattr__(self, name):
return getattr(self.df, name)
def foo(self):
print('foo')
return
the problem with the above code is that I'd like to make the following lines work:
a = CustomDF()
a.iloc[0,1]
a.foo()
but if I try to access a column of the dataframe by doing
print(a['column_name'])
I get the error "TypeError: 'CustomDF' object is not subscriptable"
Any idea on how not requiring to access .df first to get the subscription working for the super class?
Thanks!
Try inheriting from the pandas.DataFrame class. Like so:
from pandas import DataFrame
class CustomDF(DataFrame):
def foo(self):
print('foo')
return
a = CustomDF([0,1])
a.foo()