Pandas Styling (background + font) based on String data - Is there a better way?

Can I combine the lambda function for background color and the lambda function for font color into a single lambda function? This will be used for a very large dataframe with plenty of different styling, so it would be nice to cut the code in half.
Any other suggestions for a better approach are welcome.
import pandas as pd

# raw data
df = pd.DataFrame({'Name': ['name1', 'name2', 'name3', 'name1', 'name2', 'name3', 'name1', 'name2', 'name3'],
                   'Rotation': ['ER', 'PEDI', 'MAM', 'PEDI', 'ERJD', 'PEDI', 'JMAM', 'ERSN', 'ABD']})
# style
df = df.style.apply(lambda x: ["background-color: green" if 'ER' in v else "" for v in x], axis=1)\
             .apply(lambda x: ["color: orange" if 'ER' in v else "" for v in x], axis=1)\
             .apply(lambda x: ["background-color: red" if 'MAM' in v else "" for v in x], axis=1)\
             .apply(lambda x: ["color: yellow" if 'MAM' in v else "" for v in x], axis=1)
The resulting styled df (screenshot omitted): cells containing 'ER' get a green background with orange text, and cells containing 'MAM' get a red background with yellow text.

I'd do something like this (Python 3.6+ for f-strings):
def where(x):
    bg = ['green', 'red']
    fg = ['orange', 'yellow']
    ls = ['ER', 'MAM']
    for i, y in enumerate(ls):
        if y in x:
            return f"background-color: {bg[i]}; color: {fg[i]}"
    return ''

df.style.applymap(where)
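If you have many substring/style pairs, a data-driven variant of the same idea keeps all the rules in one table. This is only a sketch, assuming df is still the raw DataFrame (i.e. before the df = df.style... reassignment above) and that these substrings and colors stand in for your real rules:

# Each substring maps to the full CSS for both background and font color.
rules = {
    'ER':  'background-color: green; color: orange',
    'MAM': 'background-color: red; color: yellow',
}

def style_cell(v):
    # Apply the first rule whose key appears in the cell value.
    for key, css in rules.items():
        if key in v:
            return css
    return ''

df.style.applymap(style_cell)  # note: Styler.applymap is called Styler.map in pandas 2.1+

Adding a new style is then just one more entry in the rules dict rather than another chained .apply call.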

Related

How to expand bars over the month on the x-axis while being the same width?

# (imports assumed from the rest of the script)
import calendar
from datetime import datetime as dt

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

for i in range(len(basin)):
    prefix = "URL here"
    state = "OR"
    basin_name = basin[i]
    df_orig = pd.read_csv(f"{prefix}/{basin_name}.csv", index_col=0)
    #---create date x-index
    curr_wy_date_rng = pd.date_range(
        start=dt(curr_wy-1, 10, 1),
        end=dt(curr_wy, 9, 30),
        freq="D",
    )
    if not calendar.isleap(curr_wy):
        print("dropping leap day")
        df_orig.drop(["02-29"], inplace=True)
    use_cols = ["Median ('91-'20)", f"{curr_wy}"]
    df = pd.DataFrame(data=df_orig[use_cols].copy())
    df.index = curr_wy_date_rng
    #--create EOM percent of median values-------------------------------------
    curr_wy_month_rng = pd.date_range(
        start=dt(curr_wy-1, 10, 1),
        end=dt(curr_wy, 6, 30),
        freq="M",
    )
    df_monthly_prec = pd.DataFrame(data=df_monthly_basin[basin[i]].copy())
    df_monthly_prec.index = curr_wy_month_rng
    df_monthly = df.groupby(pd.Grouper(freq="M")).max()
    df_monthly["date"] = df_monthly.index
    df_monthly["wy_date"] = df_monthly["date"].apply(lambda x: cal_to_wy(x))
    df_monthly.index = pd.to_datetime(df_monthly["wy_date"])
    df_monthly.index = df_monthly["date"]
    df_monthly["month"] = df_monthly["date"].apply(
        lambda x: calendar.month_abbr[x.month]
    )
    df_monthly["wy"] = df_monthly["wy_date"].apply(lambda x: x.year)
    df_monthly.sort_values(by="wy_date", axis=0, inplace=True)
    df_monthly.drop(
        columns=[c for c in df_monthly.columns if "date" in c], inplace=True
    )
    # df_monthly.index = df_monthly['month']
    df_merge = pd.merge(df_monthly, df_monthly_prec, how='inner', left_index=True, right_index=True)
    #---Subplots---------------------------------------------------------------
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(df_merge.index, df_merge["Median ('91-'20)"], color="green", linewidth="1", linestyle="dashed", label='Median Snowpack')
    ax.plot(df_merge.index, df_merge[f'{curr_wy}'], color='red', linewidth='2', label='WY Current')
    #------Setting x-axis range to expand bar width for ax2
    ax.bar(df_merge.index, df_merge[basin[i]], color='blue', label='Monthly %')
    #n = n + 1
    #--format chart
    ax.set_title(chart_name[w], fontweight='bold')
    w = w + 1
    ax.set_ylabel("Basin Precipitation Index")
    ax.set_yticklabels([])
    ax.margins(x=0)
    ax.legend()
    #plt.xlim(0,9)
    #---Setting date format
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
    #---EXPORT
    plt.show()
Desired end result: plot the monthly dataframe (df_monthly_prec) together with the daily dataframe charted only at monthly values (df_monthly). The bars for the monthly DataFrame should ideally span the whole month on the chart.
I have tried creating a secondary axis, but had trouble aligning the times for the primary and secondary axes. Ideally, I would like to replace plotting df_monthly with df (showing all daily data instead of just the end-of-month values within the daily dataset).
Any assistance or pointers would be much appreciated! Apologies if additional clarification is needed.
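There is no accepted answer in this thread, but one way to make each bar span its whole month is to anchor the bars at the first of the month and give each one a width equal to that month's length in days (matplotlib date axes are in day units, so numeric widths are days). A minimal sketch only, assuming df_merge is indexed by month-end timestamps as built above:

import calendar
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd

fig, ax = plt.subplots(figsize=(8, 4))
# One width per bar, in days, so each bar covers its entire month on the date axis.
widths = [calendar.monthrange(ts.year, ts.month)[1] for ts in df_merge.index]
# Anchor each bar at the first day of its month and draw it left-aligned.
month_starts = df_merge.index - pd.offsets.MonthBegin(1)
ax.bar(month_starts, df_merge[basin[i]], width=widths, align="edge",
       color="blue", alpha=0.4, label="Monthly %")
# The daily / end-of-month lines can be plotted on the same date axis as before.
ax.plot(df_merge.index, df_merge["Median ('91-'20)"], color="green",
        linestyle="dashed", label="Median Snowpack")
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
ax.legend()
plt.show()

Because the bar positions and the line positions are both real dates, this also sidesteps the alignment problems of a secondary axis.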

How to merge pandas DF on imperfect match?

I'm trying to merge/join x and y dataframes based on an exact match of the company columns and a partial match of some degree on the name columns.
Other than looking at the values returned by SequenceMatcher(None, x_name, y_name).ratio(), which were always above .8 in my case, I haven't tried much that warrants mentioning.
x = pd.DataFrame([{'id': 1, 'name': 'Robert Jackson', 'company': 'Test inc.', 'tenure': 6},
                  {'id': 2, 'name': 'William Johnson', 'company': 'Test inc.', 'tenure': 6}]).set_index('id')

y = pd.DataFrame([{'id': 4, 'name': 'Bob Jackson', 'company': 'Test inc.', 'job': 'desk'},
                  {'id': 5, 'name': 'Willy Johnson', 'company': 'Test inc.', 'job': 'desk'}]).set_index('id')

goal = pd.DataFrame([{'x_id': 1, 'y_id': 4, 'x_name': 'Robert Jackson', 'y_name': 'Bob Jackson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'},
                     {'x_id': 2, 'y_id': 5, 'x_name': 'William Johnson', 'y_name': 'Willy Johnson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'}])
Is something like this plausible? I'd appreciate any feedback, thank you.
Great question! I'm following to see other answers as I've been doing a lot of similar work lately. One inefficient method I've taken is to use fuzzywuzzy based on a threshold.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    s = df_2[key2].tolist()
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    return df_1
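A usage sketch (not part of the original answer) showing one way to turn the fuzzy_merge helper above into an actual join on this data; the threshold of 80 and the column choices are illustrative assumptions:

# Attach the best fuzzy name match from y to each row of x, then join on it.
matched = fuzzy_merge(x.reset_index(), y.reset_index(), key1='name', key2='name', threshold=80)
result = matched.merge(y.reset_index(), left_on='matches', right_on='name',
                       suffixes=('_x', '_y'))
# Keep only rows where the exact-match requirement on company also holds.
result = result[result['company_x'] == result['company_y']]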
The solution that I used was:
from difflib import SequenceMatcher

x['merge_name'] = x['name']
x['merge_comp'] = x['company']

for a, b in x[['name', 'company']].values:
    for ixb, (c, d) in enumerate(y[['name', 'company']].values):
        # use y.index[ixb] rather than ixb itself, since y is indexed by 'id', not by position
        if SequenceMatcher(None, a, c).ratio() >= .8:
            y.loc[y.index[ixb], 'merge_name'] = a
        if SequenceMatcher(None, b, d).ratio() == 1:
            y.loc[y.index[ixb], 'merge_comp'] = b

goal = pd.merge(x, y, on=['merge_name', 'merge_comp'])
This function worked while passing an arbitrary number of columns:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=False, post_drop=True):
    if reset_index:
        df1.reset_index(inplace=True)
        df2.reset_index(inplace=True)
    flag = 0
    merge_columns = []
    r = len(columns)
    for f in range(r):
        df1[prefix + columns[flag]] = df1[columns[flag]]
        merge_columns.append(prefix + columns[flag])
        flag += 1  # was "flag =+ 1", which just re-assigns 1 on every pass
    flag = 0
    for f in range(r):
        for col_1 in df1[columns[flag]].values:
            for index, col_2 in enumerate(df2[columns[flag]].values):
                if SequenceMatcher(None, str(col_1), str(col_2)).ratio() >= ratios[flag]:
                    df2.loc[index, merge_columns[flag]] = col_1
        flag += 1
    df = pd.merge(df1, df2, on=merge_columns)
    if post_drop:
        df1.drop(columns=merge_columns, inplace=True)
        df2.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1], reset_index=True)
This function worked for passing exactly 2 columns/ratios:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=True, post_drop=True):
    df1_c = df1.copy()
    df2_c = df2.copy()
    if reset_index:
        df1_c.reset_index(inplace=True)
        df2_c.reset_index(inplace=True)
    df1_c[prefix + columns[0]] = df1_c[columns[0]]
    df1_c[prefix + columns[1]] = df1_c[columns[1]]
    merge_columns = [prefix + columns[0], prefix + columns[1]]
    for col_1, col_2 in df1_c[[columns[0], columns[1]]].values:
        for index, (col_3, col_4) in enumerate(df2_c[[columns[0], columns[1]]].values):
            if SequenceMatcher(None, str(col_1), str(col_3)).ratio() >= ratios[0]:
                df2_c.loc[index, merge_columns[0]] = col_1
            if SequenceMatcher(None, str(col_2), str(col_4)).ratio() >= ratios[1]:
                df2_c.loc[index, merge_columns[1]] = col_2
    df = pd.merge(df1_c, df2_c, on=merge_columns)
    if post_drop:
        df.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1])

Collapsing a pandas dataframe into a single column of all items and their occurrences

I have a data frame consisting of a mixture of NaNs and strings, e.g.
data = {'String1': ['NaN', 'tree', 'car', 'tree'],
        'String2': ['cat', 'dog', 'car', 'tree'],
        'String3': ['fish', 'tree', 'NaN', 'tree']}
ddf = pd.DataFrame(data)
I want to:
1: count the total number of items and put the counts in a new data frame, e.g.
NaN=2
tree=5
car=2
fish=1
cat=1
dog=1
2: count the total number of items when compared to a separate, longer list (a column of another data frame), e.g.
df['compare'] =
NaN
tree
car
fish
cat
dog
rabbit
Pear
Orange
snow
rain
Thanks
Jason
For the first question:
from collections import Counter

data = {
    "String1": ["NaN", "tree", "car", "tree"],
    "String2": ["cat", "dog", "car", "tree"],
    "String3": ["fish", "tree", "NaN", "tree"],
}
ddf = pd.DataFrame(data)

a = Counter(ddf.stack().tolist())
df_result = pd.DataFrame(dict(a), index=['Count']).T

df = pd.DataFrame({'vals': ['NaN', 'tree', 'car', 'fish', 'cat', 'dog', 'rabbit', 'Pear', 'Orange', 'snow', 'rain']})
df_counts = df.vals.map(df_result.to_dict()['Count'])
This should do it :)
You can use the following code to count items over the whole data frame.
import pandas as pd

data = {'String1': ['NaN', 'tree', 'car', 'tree'],
        'String2': ['cat', 'dog', 'car', 'tree'],
        'String3': ['fish', 'tree', 'NaN', 'tree']}
df = pd.DataFrame(data)

def get_counts(df: pd.DataFrame) -> dict:
    res = {}
    for col in df.columns:
        vc = df[col].value_counts().to_dict()
        for k, v in vc.items():
            if k in res:
                res[k] += v
            else:
                res[k] = v
    return res
counts = get_counts(df)
Output
>>> print(counts)
{'tree': 5, 'car': 2, 'NaN': 2, 'cat': 1, 'dog': 1, 'fish': 1}
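For the second part of the question (comparing against the longer list), the counts dict above can be mapped onto that column. A small sketch, assuming the comparison values live in a column named 'compare' as in the question:

compare_df = pd.DataFrame({'compare': ['NaN', 'tree', 'car', 'fish', 'cat', 'dog',
                                       'rabbit', 'Pear', 'Orange', 'snow', 'rain']})
# Items that never appear in the original frame get a count of 0.
compare_df['count'] = compare_df['compare'].map(counts).fillna(0).astype(int)
print(compare_df)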

How can the edge colors of individual matplotlib histograms be set?

I've got a rough and ready function that can be used to compare two sets of values using histograms:
I want to set the individual edge colors of each of the histograms in the top plot (much as I set the individual sets of values used for each histogram). How could this be done?
import os
import datavision
import matplotlib.pyplot
import numpy
import shijian
def main():
    a = numpy.random.normal(2, 2, size = 120)
    b = numpy.random.normal(2, 2, size = 120)
    save_histogram_comparison_matplotlib(
        values_1 = a,
        values_2 = b,
        label_1 = "a",
        label_2 = "b",
        normalize = True,
        label_ratio_x = "measurement",
        label_y = "",
        title = "comparison of a and b",
        filename = "histogram_comparison_1.png"
    )

def save_histogram_comparison_matplotlib(
    values_1 = None,
    values_2 = None,
    filename = None,
    directory = ".",
    number_of_bins = None,
    normalize = True,
    label_x = "",
    label_y = None,
    label_ratio_x = None,
    label_ratio_y = "ratio",
    title = "comparison",
    label_1 = "1",
    label_2 = "2",
    overwrite = True,
    LaTeX = False,
    #aspect = None,
    font_size = 20,
    color_1 = "#3861AA",
    color_2 = "#00FF00",
    color_3 = "#7FDADC",
    color_edge_1 = "#3861AA", # |<---------- insert magic for these
    color_edge_2 = "#00FF00", # |
    alpha = 0.5,
    width_line = 1
):
    matplotlib.pyplot.ioff()
    if LaTeX is True:
        matplotlib.pyplot.rc("text", usetex = True)
        matplotlib.pyplot.rc("font", family = "serif")
    if number_of_bins is None:
        number_of_bins_1 = datavision.propose_number_of_bins(values_1)
        number_of_bins_2 = datavision.propose_number_of_bins(values_2)
        number_of_bins = int((number_of_bins_1 + number_of_bins_2) / 2)
    if filename is None:
        if title is None:
            filename = "histogram_comparison.png"
        else:
            filename = shijian.propose_filename(
                filename = title + ".png",
                overwrite = overwrite
            )
    else:
        filename = shijian.propose_filename(
            filename = filename,
            overwrite = overwrite
        )
    values = []
    values.append(values_1)
    values.append(values_2)
    bar_width = 0.8
    figure, (axis_1, axis_2) = matplotlib.pyplot.subplots(
        nrows = 2,
        gridspec_kw = {"height_ratios": (2, 1)}
    )
    ns, bins, patches = axis_1.hist(
        values,
        color = [
            color_1,
            color_2
        ],
        normed = normalize,
        histtype = "stepfilled",
        bins = number_of_bins,
        alpha = alpha,
        label = [label_1, label_2],
        rwidth = bar_width,
        linewidth = width_line,
        #edgecolor = [color_edge_1, color_edge_2] <---------- magic here? dunno
    )
    axis_1.legend(
        loc = "best"
    )
    bars = axis_2.bar(
        bins[:-1],
        ns[0] / ns[1],
        alpha = 1,
        linewidth = 0, #width_line
        width = bins[1] - bins[0]
    )
    for bar in bars:
        bar.set_color(color_3)
    axis_1.set_xlabel(label_x, fontsize = font_size)
    axis_1.set_ylabel(label_y, fontsize = font_size)
    axis_2.set_xlabel(label_ratio_x, fontsize = font_size)
    axis_2.set_ylabel(label_ratio_y, fontsize = font_size)
    #axis_1.xticks(fontsize = font_size)
    #axis_1.yticks(fontsize = font_size)
    #axis_2.xticks(fontsize = font_size)
    #axis_2.yticks(fontsize = font_size)
    matplotlib.pyplot.suptitle(title, fontsize = font_size)
    if not os.path.exists(directory):
        os.makedirs(directory)
    #if aspect is None:
    #    matplotlib.pyplot.axes().set_aspect(
    #        1 / matplotlib.pyplot.axes().get_data_ratio()
    #    )
    #else:
    #    matplotlib.pyplot.axes().set_aspect(aspect)
    figure.tight_layout()
    matplotlib.pyplot.subplots_adjust(top = 0.9)
    matplotlib.pyplot.savefig(
        directory + "/" + filename,
        dpi = 700
    )
    matplotlib.pyplot.close()

if __name__ == "__main__":
    main()
You may simply plot two different histograms but share the bins.
import numpy as np; np.random.seed(3)
import matplotlib.pyplot as plt
a = np.random.normal(size=(89,2))
kws = dict(histtype="stepfilled", alpha=0.5, linewidth=2)
hist, edges, _ = plt.hist(a[:, 0], bins=6, color="lightseagreen", label="A", edgecolor="k", **kws)
plt.hist(a[:, 1], bins=edges, color="gold", label="B", edgecolor="crimson", **kws)
plt.show()
Use the lists of Patches objects returned by the hist() function.
In your case, you have two datasets, so your variable patches will be a list containing two lists, each with the Patches objects used to draw the bars on your plot.
You can easily set the properties on all of these objects using the setp() function. For example:
import numpy as np
import matplotlib.pyplot as plt

a = np.random.normal(size=(100,))
b = np.random.normal(size=(100,))
# e holds the patches: one list of bar Patch objects per dataset
c, d, e = plt.hist([a, b], color=['r', 'g'])
plt.setp(e[0], edgecolor='k', lw=2)
plt.setp(e[1], edgecolor='b', lw=3)

pandas: map color argument by multidict

I would like to map a color to each row in the dataframe as a function of two columns. It would be much easier with just one column as the argument, but how can I achieve this with two columns?
What I have done so far:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

a = np.random.rand(3, 10)
i = [[30, 10], [10, 30], [60, 60]]
names = ['a', 'b']
index = pd.MultiIndex.from_tuples(i, names=names)
df = pd.DataFrame(a, index=index).reset_index()

c1 = plt.cm.Greens(np.linspace(0.2, 0.8, 3))
c2 = plt.cm.Blues(np.linspace(0.2, 0.8, 3))
#c3 = plt.cm.Reds(np.linspace(0.2, 0.8, 3))
color = np.vstack((c1, c2))

a = df.a.sort_values().values
b = df.b.sort_values().values

mapping = dict()
for i in range(len(a)):
    mapping[a[i]] = {}
    for ii in range(len(b)):
        mapping[a[i]][b[ii]] = color[i + ii]
Maybe something similar to df['color'] = df.apply(lambda x: mapping[x.a][x.b])?
Looks like you answered your own question. apply can work across the rows by changing the axis argument to 1: df['color'] = df.apply(lambda x: mapping[x.a][x.b], axis=1)
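Putting that together, continuing from the snippet above (the plotting loop at the end is just one illustrative way to use the resulting color column, not part of the original answer):

# Each row gets the RGBA color looked up from its (a, b) combination.
df['color'] = df.apply(lambda x: mapping[x.a][x.b], axis=1)

# Example use: draw each row's values in its mapped color.
fig, ax = plt.subplots()
for _, row in df.iterrows():
    ax.plot(row.drop(['a', 'b', 'color']).astype(float).values,
            color=row['color'], label=f"a={row.a}, b={row.b}")
ax.legend()
plt.show()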