Pandas dataframe dump to excel with color formatting - pandas

I have a large pandas dataframe df as:
Sou ATC P25 P75 Avg
A 11 9 15 10
B 6.63 15 15 25
C 6.63 5 10 8
I want to print this dataframe to an Excel file, but I want to apply formatting to each row of the Excel file such that the following rules are applied to cells in the ATC and Avg columns:
colored in red if value is less than P25
colored in green if value is greater than P75
colored in yellow if value is between P25 and P75
Sample display in excel is as follows:
I am not sure how to approach this.

You can use style.Styler.apply with DataFrame of styles with numpy.select for filling by masks created by DataFrame.lt and
DataFrame.gt:
def color(x):
    """Build the per-cell CSS styles for ``df.style.apply(color, axis=None)``.

    Each cell in the ``ATC`` and ``Avg`` columns is compared with the
    ``P25`` and ``P75`` values of its own row:

    * below ``P25``                 -> red background
    * above ``P75``                 -> green background
    * from ``P25`` to ``P75``       -> yellow background

    Cells that cannot be compared (a missing value in the cell or in either
    bound) are left unstyled instead of being painted yellow.

    Parameters
    ----------
    x : pandas.DataFrame
        The whole frame being styled; must contain the columns ``ATC``,
        ``Avg``, ``P25`` and ``P75``.

    Returns
    -------
    pandas.DataFrame
        A frame of CSS strings with the same index and columns as ``x``;
        every column other than ``ATC`` and ``Avg`` holds empty strings.
    """
    c1 = 'background-color: red'
    c2 = 'background-color: green'
    c3 = 'background-color: yellow'
    c = ''
    cols = ['ATC', 'Avg']
    # axis=0 aligns the bound Series with the row index, so every row is
    # compared against its own P25 / P75.
    m1 = x[cols].lt(x['P25'], axis=0)
    m2 = x[cols].gt(x['P75'], axis=0)
    # Explicit in-range mask: comparisons with NaN are all False, so missing
    # values match none of the three conditions and fall through to ''.
    m3 = x[cols].ge(x['P25'], axis=0) & x[cols].le(x['P75'], axis=0)
    arr = np.select(
        [m1.to_numpy(), m2.to_numpy(), m3.to_numpy()],
        [c1, c2, c3],
        default=c,
    )
    df1 = pd.DataFrame(arr, index=x.index, columns=cols)
    # Re-expand to the full column set; untouched columns get no style.
    return df1.reindex(columns=x.columns, fill_value=c)
# axis=None makes Styler.apply pass the whole DataFrame to color() in a single call.
df.style.apply(color,axis=None).to_excel('format_file.xlsx', index=False, engine='openpyxl')

Related

Subplots with counter like legends

I have written plot_dataframe() to create two subplots (one for line chart and another for histogram bar chart) for a dataframe that is passed via argument.
Then I call this function from plot_kernels() with multiple dataframes.
# Question's original helper, kept as posted: draws the first row of df as a
# line chart in the top subplot and the counts of its values as a bar chart
# in the bottom subplot. cnt is passed as the `legend` argument of both plots.
def plot_dataframe(df, cnt):
row = df.iloc[0].astype(int) # First row in the dataframe
plt.subplot(2, 1, 1)
row.plot(legend=cnt) # Line chart
plt.subplot(2, 1, 2)
df2 = row.value_counts()
df2.reindex().plot(kind='bar', legend=cnt) # Histogram
def plot_kernels(mydict2):
    """Plot every dataframe stored in ``mydict2`` onto one figure.

    ``mydict2`` maps a key to a dataframe. Each dataframe is handed to
    ``plot_dataframe`` together with a counter that starts at 1 and grows by
    one per dataframe; the figure is shown once all of them are drawn.
    """
    plt.figure(figsize=(20, 15))
    cnt = 1
    # Iterate the ``mydict2`` parameter itself; the loop used to read an
    # undefined name ``my_dict2`` and raised NameError.
    for key in mydict2:
        df = mydict2[key]
        plot_dataframe(df, cnt)
        cnt = cnt + 1
    plt.show()
The dictionary looks like
{'K1::foo(bar::z(x,u))': Value Value
0 10 2
1 5 2
2 10 2, 'K3::foo(bar::y(z,u))': Value Value
0 6 12
1 7 13
2 8 14}
And based on the values in row[0], [10,2] are shown in blue line and [6,12] are shown in orange line. For histogram, they are similar. As you can see the legends in the subplots are shown as 0 in the figure. I expect to see 1 and 2. How can I fix that?
Change legend to label, then force the legend after you plot everything:
def plot_dataframe(df, cnt, axes):
    """Draw the first row of ``df`` on the two supplied axes.

    ``axes[0]`` receives a line chart of the row and ``axes[1]`` a bar chart
    of how often each value occurs in it. Both plots are labelled with
    ``cnt``; the caller renders the legends afterwards.
    """
    first_row = df.iloc[0].astype(int)
    # label= (not legend=) makes the counter the text of the legend entry.
    first_row.plot(ax=axes[0], label=cnt)
    occurrences = first_row.value_counts()
    occurrences.plot(ax=axes[1], kind='bar', label=cnt)
def plot_kernels(d):
    """Plot every dataframe in ``d`` onto a shared pair of subplots.

    The dataframes are numbered from 1 in iteration order and that number is
    passed to ``plot_dataframe`` as the label. Legends are drawn once, after
    every dataframe has been plotted, and then the figure is shown.
    """
    # Create both axes up front so each dataframe draws onto the same pair.
    fig, axes = plt.subplots(2, 1, figsize=(20, 15))
    for cnt, key in enumerate(d, start=1):
        plot_dataframe(d[key], cnt, axes=axes)
    # Render the legends only now that all labelled plots exist.
    for ax in axes:
        ax.legend()
    plt.show()
Output:

read_excel modifies unexpected data

Is it possible that the pandas function read_excel is modifying data from the Excel file?
It seems it changes for example TRUE to 1 and FALSE to False.
I use this code
# dtype=str asks pandas to return every cell as a string.
df = pd.read_excel(DEFAULT_PATH_2_XLSX_FILE, dtype=str)
There is a good explanation for that. pandas reads the raw cell values, not the displayed values:
# header=None keeps the first sheet row as data; dtype=str requests string values.
df = pd.read_excel('data.xlsx', dtype=str, header=None)
print(df)
# Output
0 1
0 1 0 # A1, B1
1 1 0 # A2, B2
2 1 0 # A3, B3
In this screenshot, this is how I entered values:
A1: 1
B1: 0
A2: 1 then formatted as boolean value
B2: 0 then formatted as boolean value
A3: TRUE (type as it)
B3: FALSE (type as it)

Merge rows with same id, different vallues in 1 column to multiple columns

What I have: the number of rows per id can vary, so sometimes one id has 4 rows with different values in column val; the other columns all hold the same values.
# NOTE(review): 'id' has 9 entries while 'val' and 'media' have 8 each, so
# pd.DataFrame raises a ValueError for these unequal-length columns as written.
df1 = pd.DataFrame({'id':[1,1,1,2,2,2,3,3,3], 'val': ['06123','nick','#gmail','06454','abey','#gmail','06888','sisi'], 'media': ['nrc','nrc','nrc','nrc','nrc','nrc','nrc','nrc']})
what i need
id kolom 1 kolom2 kolom 3 media
1 06123 nick #gmail nrc
2 06454 abey #gmail nrc
3 6888 sisi None nrc
I hope I gave a good example, in the corrected way, thanks for the help
# Collect each column into one list per id.
df2 = df1.groupby('id').agg(list)
# Spread the first three 'val' entries over their own columns; ids with fewer
# entries get the string 'None' as a placeholder.
df2['col 1'] = df2['val'].apply(lambda x: x[0] if len(x) > 0 else 'None')
df2['col 2'] = df2['val'].apply(lambda x: x[1] if len(x) > 1 else 'None')
df2['col 3'] = df2['val'].apply(lambda x: x[2] if len(x) > 2 else 'None')
# 'media' is the same on every row of an id, so keep only its first entry.
df2['media'] = df2['media'].apply(lambda x: x[0] if len(x) > 0 else 'None')
# drop() returns a new frame; assign it back so 'val' is really removed.
df2 = df2.drop(columns='val')
df2
Here is another way. Since your original dataframe doesn't have lists of the same length (which will get you a ValueError), you can define it as:
# Columns of unequal length: 'id' has nine entries, 'val' and 'media' eight.
data = {
    "id": [1, 1, 1, 2, 2, 2, 3, 3, 3],
    "val": ["06123", "nick", "#gmail", "06454", "abey", "#gmail", "06888", "sisi"],
    "media": ["nrc", "nrc", "nrc", "nrc", "nrc", "nrc", "nrc", "nrc"],
}
# orient="index" turns each key into a row, which tolerates the shorter lists;
# transposing afterwards turns those rows back into columns.
df = pd.DataFrame.from_dict(data, orient="index").T
>>> df
id val media
0 1 06123 nrc
1 1 nick nrc
2 1 #gmail nrc
3 2 06454 nrc
4 2 abey nrc
5 2 #gmail nrc
6 3 06888 nrc
7 3 sisi nrc
8 3 NaN NaN
Afterwards, you can replace the np.nan values with an empty string, so that you can group by your id column and join the values in val, separated by a comma.
# Blank out the missing values so every 'val' entry can be joined as a string.
df = df.replace(np.nan, "", regex=True)
# Build one comma-separated string of 'val' entries per id.
joined = df.groupby(["id"])["val"].apply(",".join)
df_new = joined.reset_index()
>>> df_new
id val
0 1.0 06123,nick,#gmail
1 2.0 06454,abey,#gmail
2 3.0 06888,sisi,
Then, you only need to transform the new val column into 3 columns by splitting the string inside, with any method you want. For example,
# Split the comma-joined string into one column per entry.
new_cols = df_new["val"].str.split(",", expand=True) # Good ol' split
df_new["kolom 1"] = new_cols[0] # Assign to new columns
df_new["kolom 2"] = new_cols[1]
df_new["kolom 3"] = new_cols[2]
# Name the column via columns=: pandas 2.0+ no longer accepts the axis as a
# second positional argument to drop().
df_new.drop(columns="val", inplace=True) # Delete previous val
df_new["media"] = "nrc" # Add the media column again
df_new = df_new.replace("", np.nan, regex=True) # If necessary, replace empty string with np.nan
>>> df_new
id kolom 1 kolom 2 kolom 3 media
0 1.0 06123 nick #gmail nrc
1 2.0 06454 abey #gmail nrc
2 3.0 06888 sisi NaN nrc

Subset two consecutive event occurrence in pandas

I'm trying to get a subset of my data whenever there is a consecutive occurrence of two events in that order. The events are time-stamped. So every time there are continuous 2's and then continuous 3's, I want to subset that to a dataframe and add it to a dictionary. The following code does that, but I have to apply this to a very large dataframe of more than 20 million observations. This is extremely slow using iterrows. How can I make this fast?
# Question's original row-by-row approach, kept exactly as posted (its
# indentation was lost, so the nesting of the branches below is not shown).
# It walks df with iterrows(), accumulates rows in dfb, and stores finished
# groups in the dict C under the key str(dfb.iloc[0,0]).
# NOTE(review): DataFrame.append was removed in pandas 2.0, so this only runs
# on older pandas versions.
df = pd.DataFrame({'Date': [101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122],
'Event': [1,1,2,2,2,3,3,1,3,2,2,3,1,2,3,2,3,2,2,3,3,3]})
dfb = pd.DataFrame(columns = df.columns)
C = {}
f1 = 0
for index, row in df.iterrows():
if ((row['Event'] == 2) & (3 not in dfb['Event'].values)):
dfb = dfb.append(row)
f1 =1
elif ((row['Event'] == 3) & (f1 == 1)):
dfb = dfb.append(row)
elif 3 in dfb['Event'].values:
f1 = 0
C[str(dfb.iloc[0,0])] = dfb
del dfb
dfb = pd.DataFrame(columns = df.columns)
if row['Event'] == 2:
dfb = dfb.append(row)
f1 =1
else:
f1=0
del dfb
dfb = pd.DataFrame(columns = df.columns)
Edit: The desired output is basically a dictionary of the subsets shown in the image: https://i.stack.imgur.com/ClWZs.png
If you want to accelerate this, you should vectorize your code. You could try it like this (df is the same as in your code):
# Work on a copy so the caller's df is left untouched.
vec = df.copy()
# Event_y holds the previous row's event (0 for the first row).
vec['Event_y'] = vec['Event'].shift(1).fillna(0).astype(int)
# Rows that repeat the previous event, ignoring event 1.
repeats_previous = (vec['Event_y'] == vec['Event']) & (vec['Event'] != 1)
vec['Same_Flag'] = float('nan')
# Assign through a single .loc call: the chained form
# vec.Same_Flag.loc[...] = 1 may write to a temporary and leave vec unchanged.
vec.loc[repeats_previous, 'Same_Flag'] = 1
# Only the flagged rows have no NaN left, so dropna keeps exactly those.
vec.dropna(inplace=True)
vec.loc[:, ['Date', 'Event']]
Output is:
Date Event
3 104 2
4 105 2
6 107 3
10 111 2
18 119 2
20 121 3
21 122 3
I think that's close to what you need. You could improve based on that.
I don't understand why dates 104, 105 and 107 are not counted.

How to split dict in dataframe to many columns

I'm using dataframe. How to split dict list to many columns?
This is for a junior dataprocessor. In the past, I've tried on many ways.
import pandas as pd
# Nested records that should each become one output row.
l = [
    {'a': 1, 'b': 2},
    {'a': 3, 'b': 4},
]
# Input: a single row whose 'value' cell holds the whole list of dicts.
data = [{'key1': 'x', 'key2': 'y', 'value': l}]
df = pd.DataFrame(data)
# Desired result: one row per dict, with the key columns repeated.
data1 = {
    'key1': ['x', 'x'],
    'key2': ['y', 'y'],
    'a': [1, 3],
    'b': [2, 4],
}
df1 = pd.DataFrame(data1)
df1 is what I need.
comprehension
# Key columns only; the nested 'value' column is handled separately.
d1 = df.drop(columns='value')
co = d1.columns
d2 = df.value
records = []
# zip(*columns) walks d1 row by row; each row is paired with its list of dicts.
for tup, D in zip(zip(*(d1[name] for name in d1)), d2):
    shared = dict(zip(co, tup))
    for d in D:
        # The dict's own entries are merged on top of the repeated key columns.
        records.append({**shared, **d})
pd.DataFrame(records)
a b key1 key2
0 1 2 x y
1 3 4 x y
Explode
See post on explode
This is a tad different but close
# Repeat each index label once per dict held in that row's 'value' list.
lengths = df['value'].str.len()
idx = df.index.repeat(lengths)
# Flatten all the lists into one list of dicts, in row order.
val = np.concatenate(df['value']).tolist()
d0 = pd.DataFrame(val)
# Repeat the key columns to match, renumber the rows, then attach the new columns.
repeated_keys = df.drop(columns='value').loc[idx].reset_index(drop=True)
repeated_keys.join(d0)
a b key1 key2
0 1 2 x y
1 3 4 x y