Plotting stacked barchart with pandas of multiple columns grouped - pandas

I have two dataframes which I need to get the difference and then plot one of them on top of this difference. Here is a minimal example:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.DataFrame([[2,5,7,6,7],[4,4,4,4,3],[8,8,7,3,4],[16,10,12,13,16]], columns=["N", "A", "B", "C", "D"])
df2 = pd.DataFrame([[2,1,3,6,5],[4,1,2,3,2],[8,2,2,3,3],[16,8,10,3,11]], columns=["N", "A", "B", "C", "D"])
dfDiff = df1 - df2
dfDiff['N'] = df1['N']
# Individual barchart
colors = ['#6c8ebf', '#82b366', '#F7A01D', '#9876a7']
df1.set_index('N')[["A", "B", "C", "D"]].plot.bar(color=colors)
df2.set_index('N')[["A", "B", "C", "D"]].plot.bar(color=colors)
dfStacked = pd.DataFrame(columns=["N", "A", "A_diff", "B", "B_diff"])
dfStacked["N"] = df2["N"]
dfStacked["A"] = df2["A"]
dfStacked["B"] = df2["B"]
dfStacked["C"] = df2["C"]
dfStacked["D"] = df2["D"]
dfStacked["A_diff"] = dfDiff["A"]
dfStacked["B_diff"] = dfDiff["B"]
dfStacked["C_diff"] = dfDiff["C"]
dfStacked["D_diff"] = dfDiff["D"]
dfStacked.set_index('N').plot.bar(stacked=True)
plt.show()
The dataframes look like this:
The thing is that now the new stacked one ends up with everything merged. I want to have "A" stacked with "A_diff", "B", stacked with "B_diff", "C" stacked with "C_diff" and "D" stacked with "D_diff".
For example, I changed the code to do it with "A" and "A_diff" as dfStacked.set_index('N')[["A", "A_diff"]].plot.bar(stacked=True) which looks correct, but I want A,B,C and D grouped by N like in the first two figures.
Do I need a new dataframe for this, like dfStacked? If so, in which form should the content be added? And how can I keep the same colors but add hatch="/" only for the "top" stacked bar?
Would it be better to have the dataframe as below?:
df3 = pd.DataFrame(columns=["N", "Algorithm", "df1", "dfDiff"])
df3.loc[len(df3)] = [2, "A", 20, 10]
df3.loc[len(df3)] = [2, "A", 1, 4]
df3.loc[len(df3)] = [4, "A", 2, 3]
df3.loc[len(df3)] = [4, "A", 3, 4]
df3.loc[len(df3)] = [2, "B", 1, 3]
df3.loc[len(df3)] = [2, "B", 2, 4]
df3.loc[len(df3)] = [4, "B", 3, 3]
df3.loc[len(df3)] = [4, "B", 4, 2]
But how to group them by "N" and "Algorithm"? I mean, each row corresponds to one bar, just they should be grouped by "N" with all the "Algorithms" and the two last columns are the two "parts" of each bar. It would be good that the colors match the first two figures (for the "Algorithms") but the top part of the bar has hatch="/" for example.
Thanks for the help

I'll start from df1, df2 and get dfStacked in a slightly different way:
import pandas as pd
df1 = pd.DataFrame(
[
[2,5,7,6,7],
[4,4,4,4,3],
[8,8,7,3,4],
[16,10,12,13,16]
],
columns=["N", "A", "B", "C", "D"]
).set_index('N')
df2 = pd.DataFrame(
[
[2,1,3,6,5],
[4,1,2,3,2],
[8,2,2,3,3],
[16,8,10,3,11]
],
columns=["N", "A", "B", "C", "D"]
).set_index('N')
dfStacked = pd.concat(
[df1, df1-df2],
axis=1,
keys=['raw','diff']
).reorder_levels([1,0], axis=1)
Now we have this DataFrame:
To draw this data in a bar chart stacked by the first level we could make use of two DataFrame.plot's features - ax and bottom. The first one is the location of the axes where the barplot should be drawn, the second one is for the values where the bottom line of the bars should start. For details run help(plt.bar) to read about bottom and help(pd.DataFrame.plot) to read about ax.
import matplotlib.pyplot as plt
from matplotlib.colors import TABLEAU_COLORS
plt.figure(figsize=(10,7))
ax = plt.gca()
names = dfStacked.columns.levels[0]
n = len(names)
color = iter(TABLEAU_COLORS)
w = 1/(n+2) # width
h = '/'*5 # hatch for diff values
for i, name in enumerate(names):
c = next(color) # color
p = n/2 - i # position
dfStacked[name]['raw'].plot.bar(
ax=ax,
position=p,
width=w,
color=c,
label=f'{name} raw'
)
dfStacked[name]['diff'].plot.bar(
ax=ax,
bottom=dfStacked[name]['raw'],
hatch=h,
position=p,
width=w,
color=c,
label=f'{name} diff'
)
ax.set_xlim([-1, n])
ax.tick_params(axis='x', rotation=0)
ax.legend();
And here's the output:

IIUC, try using position parameter in pd.DataFrame.plot.bar:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df1 = pd.DataFrame([[2,5,7,6,7],[4,4,4,4,3],[8,8,7,3,4],[16,10,12,13,16]], columns=["N", "A", "B", "C", "D"])
df2 = pd.DataFrame([[2,1,3,6,5],[4,1,2,3,2],[8,2,2,3,3],[16,8,10,3,11]], columns=["N", "A", "B", "C", "D"])
dfDiff = df1 - df2
dfDiff['N'] = df1['N']
dfStacked = pd.DataFrame(columns=["N", "A", "A_diff", "B", "B_diff"])
dfStacked["N"] = df2["N"]
dfStacked["A"] = df2["A"]
dfStacked["B"] = df2["B"]
dfStacked["C"] = df2["C"]
dfStacked["D"] = df2["D"]
dfStacked["A_diff"] = dfDiff["A"]
dfStacked["B_diff"] = dfDiff["B"]
dfStacked["C_diff"] = dfDiff["C"]
dfStacked["D_diff"] = dfDiff["D"]
dfStacked = dfStacked.set_index('N')
colors = ['red', 'slateblue', 'lightseagreen', 'orange']
colors_c = ['darkred', 'blue', 'darkgreen', 'darkorange']
ax = dfStacked.filter(like='A').plot.bar(stacked=True, position=2, width=.1, color=[colors[0], colors_c[0]], edgecolor='w', alpha=.8)
dfStacked.filter(like='B').plot.bar(stacked=True, ax=ax, position=1, width=.1, color=[colors[1], colors_c[1]], edgecolor='w', alpha=.8)
dfStacked.filter(like='C').plot.bar(stacked=True, ax=ax, position=0, width=.1, color=[colors[2], colors_c[2]], edgecolor='w', alpha=.8)
dfStacked.filter(like='D').plot.bar(stacked=True, ax=ax, position=-1, width=.1, color=[colors[3], colors_c[3]], edgecolor='w', alpha=.8)
ax.set_xlim(-.5,3.5)
plt.legend(loc='upper center', ncol=4, bbox_to_anchor=(.5, 1.2))
plt.show()
Output:

Related

matplotlib - plot merged dataframe with group bar

I try to plot a grouped bar chart from a merged dataframe. below code the bar is stacked, how can I put it side by side just like a grouped bar chart?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
df1 = pd.DataFrame({
'key': ['A', 'B', 'C', 'D'],
'value':[ 10 ,6, 6, 8]})
df2 = pd.DataFrame({
'key': ['B', 'D', 'A', 'F'],
'value':[ 3, 5, 5, 7]})
df3 = pd.merge(df1, df2, how='inner', on=['key'])
print(df1)
print(df2)
print(df3)
fig, ax = plt.subplots(figsize=(12, 8))
b1 = ax.bar(df3['key'],df3['value_x'])
b2 = ax.bar(df3['key'],df3['value_y'])
pngname = "demo.png"
fig.savefig(pngname, dpi=fig.dpi)
print("[[./%s]]"%(pngname))
Current output:
The problem is that the x axis data is the same, in your case it aren't numbers, it are the keys: "A", "B", "C". So matplotlib stacks them one onto another.
There's a simple way around it, as some tutorials online show https://www.geeksforgeeks.org/create-a-grouped-bar-plot-in-matplotlib/.
So, what you do is basically enumerate the keys, i.e. A=1, B=2, C=3. After this, choose your desired bar width, I chose 0.4 for example. And now, shift one group of bars to the left by bar_width/2, and shift the other one to the right by bar_width/2.
Perhaps the code explains it better than I did:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
df1 = pd.DataFrame({
'key': ['A', 'B', 'C', 'D'],
'value':[ 10 ,6, 6, 8]})
df2 = pd.DataFrame({
'key': ['B', 'D', 'A', 'F'],
'value':[ 3, 5, 5, 7]})
df3 = pd.merge(df1, df2, how='inner', on=['key'])
fig, ax = plt.subplots(figsize=(12, 8))
# modifications
x = np.arange(len(df3['key'])) # enumerate the keys
bar_width = 0.4 # choose bar length
b1 = ax.bar(x - bar_width/2,df3['value_x'], width=bar_width, label='value_x') # shift x values left
b2 = ax.bar(x + bar_width/2,df3['value_y'], width=bar_width, label='value_y') # shift x values right
plt.xticks(x, df3['key']) # replace x axis ticks with keys from df3.
plt.legend(['value_x', 'value_y'])
plt.show()
Result:

How to align a legend relative to a GridSpec cell?

I am creating a figure like this:
fig = plt.figure(figsize = (7, 8))
outer_grid = gridspec.GridSpec(2, 1, height_ratios = [2, 1])
inner_grid1 = gridspec.GridSpecFromSubplotSpec(4, 3, subplot_spec=outer_grid[0])
inner_grid2 = gridspec.GridSpecFromSubplotSpec(2, 3, subplot_spec=outer_grid[1])
Now I would like to have one legend for all plots in inner_grid1 and a separate legend for all plots in inner_grid2. And I would like those legends to be placed nicely, even though they are higher than a single plot, and cannot have more than one column to not make the figure too wide.
Here is an example where I tried to align the legends with trial and error with method 2 below, however this took ages to make.
So I see three options to achieve this, none of which work:
Place the legend as part of an Axes object, but manually move it outside of the actual plot using axes.legend([...], bbox_to_anchor=(x, y)). This does not work when the legend is higher as a single plot, because it rescales the plots to fit the legend into its grid cell.
Place the legend globally on the Figure object. This works, but makes the correct placement really hard. I cannot use loc = "center right", since it centers it for the full figure instead of just the inner_grid1 or inner_grid2 plots.
Place the legend locally on the GridSpecFromSubplotSpec object. This would be perfect. However there is no method to create a legend on a GridSpecFromSubplotSpec or related classes, and the pyplot.legend method misses parameters to restrict the loc to parts of a grid.
Is there a way to place a legend as described?
As requested, a small code example generating something similar as desired.
This example uses method 2:
#!/usr/bin/env python3
import pandas as pd, seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
GENOMES = ["spneumoniae", "ecoliK12", "scerevisiae", "celegans", "bmori", "hg38"]
fig = plt.figure(figsize = (7, 8))
outer_grid = gridspec.GridSpec(2, 1, height_ratios = [2, 1])
inner_grid1 = gridspec.GridSpecFromSubplotSpec(4, 3, subplot_spec=outer_grid[0])
inner_grid2 = gridspec.GridSpecFromSubplotSpec(2, 3, subplot_spec=outer_grid[1])
# plots are in sets of six, 2 rows by 3 columns each
for index, genome in enumerate(GENOMES):
data = pd.DataFrame({"x": [0, 1, 2, 3, 0, 1, 2, 3], "y": [1, 0, 3, 2, 1, 0, 3, 2], "hue": ["a", "a", "a", "a", "b", "b", "b", "b"]})
# first set of six
ax1 = plt.Subplot(fig, inner_grid1[index])
ax1 = sns.lineplot(data = data, x = "x", y = "y", hue = "hue", ax = ax1)
ax1.set_xlabel("")
ax1.set_ylabel("")
if index == 2:
ax1.legend()
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, loc = "center left", title = "", bbox_to_anchor=(0.9, 2/3 - 0.03))
ax1.legend([], [], loc = "lower center", title = f"{genome}")
fig.add_subplot(ax1)
# second set of six
ax2 = plt.Subplot(fig, inner_grid1[index + 6])
ax2 = sns.lineplot(data = data, x = "x", y = "y", hue = "hue", ax = ax2)
ax2.set_xlabel("")
ax2.set_ylabel("")
ax2.legend([], [], loc = "upper center", title = f"{genome}")
fig.add_subplot(ax2)
#third set of six
ax3 = plt.Subplot(fig, inner_grid2[index])
ax3 = sns.lineplot(data = data, x = "x", y = "y", hue = "hue", ax = ax3)
ax3.set_xlabel("")
ax3.set_ylabel("")
if index == 2:
ax3.legend(["#unitigs", "avg. unitig len."])
handles, labels = ax3.get_legend_handles_labels()
fig.legend(handles, labels, loc = "center left", title = "", bbox_to_anchor=(0.9, 1/6 + 0.05))
ax3.legend([], [], loc = "upper center", title = f"{genome}")
fig.add_subplot(ax3)
plt.savefig("stackoverflow_test.pdf", bbox_inches="tight")

order plotly legend by custom order

I have a plotly figure to which I add values from two dataframes, df_a and df_b. I display both dataframes in different subplots, but they share a legend. How can I order the shared legend? The expected order is: [a, b, c, f, g]. Please see below for the current implementation - it seems to pick the order from input data in some way.
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
buckets = ["a", "b", "c", "f", "g"]
fig_subplots = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=["df_a", "df_b"])
def get_chart(df, buckets, d_legend):
fig = go.Figure()
colorscale = px.colors.qualitative.Pastel
i_color = 0
unique_dates = [str(dt) for dt in df.date.unique()]
for bucket in sorted(buckets, reverse=True):
y_values = df[df['variable'] == bucket]["value"].to_list()
enable_legend_for_bucket = False if bucket in d_legend else True
fig.add_trace(go.Bar(
name=bucket,
x=unique_dates,
y=y_values,
marker_color=colorscale[i_color],
legendgroup=bucket,
showlegend=enable_legend_for_bucket
))
if len(y_values) != 0:
d_legend[bucket] = True # store first time this bucket was populated for legend
i_color += 1
fig.update_layout(barmode="stack")
return fig, d_legend
list_df = [df_a.melt(id_vars="date"), df_b.melt(id_vars="date")]
d_legend = {}
iRow = 1
for df in list_df:
fig, d_legend = get_chart(df, buckets, d_legend)
for el in fig['data']:
fig_subplots.append_trace(el, iRow, 1)
iRow += 1
fig_subplots.update_layout(barmode='stack', legend={'traceorder':'normal'})
Without finding an easy built-in method to order the legend, and to have the same stacking of values with the legend sorted in any desired order.
new_order = sorted(['a','b','c','g','f'],reverse=True)
print(new_order)
ordered_object_list =[]
for i in new_order:
item = [obj for obj in fig_subplots.data if obj['name'] == i]
ordered_object_list += item
fig_subplots.data = ordered_object_list
fig_subplots.update_layout(legend={'traceorder':'reversed'})
There is also an approach with plotly express:
import pandas as pd
import plotly.express as px
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_a["id"] = "df_a"
df_b["id"] = "df_b"
df_c = df_a.append(df_b)
px.bar(df_c.melt(id_vars=["date", "id"]), x="date", y="value", color="variable", facet_col="id", range_y=[0,20], facet_col_wrap=1)

Formatting Date labels using Seaborn FacetGrid

I want to make a facet grid with variable names as the columns, and departments as the rows, and each small chart is a scatter chart of y=value and x=date
My data is sort of like this:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import matplotlib.dates as mdates
import random
datelist = pd.date_range(start="march 1 2020", end="may 20 2020", freq="w").tolist()
varlist = ["x", "y", "z", "x", "y", "z", "x", "y", "z", "x", "y", "z"]
deptlist = ["a", "a", "b", "a", "a", "b", "a", "a", "b", "a", "a", "b"]
vallist = random.sample(range(10, 30), 12)
df = pd.DataFrame({'date': datelist, 'value': vallist, 'variable': varlist, 'department': deptlist})
I want to make a facet grid with variable names as the columns, and departments as the rows, and each small chart is a scatter chart of y=value and x=date
Here is what I have so far. It almost works, except I want to see dates along the bottom that are not squished together, so I would like to see "3/1 4/1 5/1" instead of full dates. But I can't figure out how to format it.
plt.style.use('seaborn-darkgrid')
xformatter = mdates.DateFormatter("%m-%d")
g = sns.FacetGrid(df2, row="department", col="variable", sharey='row')
g = g.map(plt.plot, "date", "value", marker='o', markersize=0.7)
datelist = pd.date_range(start="march 1 2020", end="june 1 2020", freq="MS").tolist()
g.set(xticks=datelist)
This is pretty close, but notice the dates along the bottom x axes. They are all scrunched together. That's why I tried to use a special date formatter but couldn't get it to work. Really what I would like is that each date shows up as mon-dd and that I can control how many tick marks appear there.
You can access the Axes object of the FacetGrid as g.axes (a 2D array). You could iterate over this array and change the properties of all the axes, but in your case you have sharex=True (the default), that means that changing the xaxis of one of the subplots will change all of the subplots at the same time.
g = sns.FacetGrid(df, row="department", col="variable", sharey='row')
g = g.map(plt.plot, "date", "value", marker='o', markersize=0.7)
xformatter = mdates.DateFormatter("%m/%d")
g.axes[0,0].xaxis.set_major_formatter(xformatter)

Plotting multiple columns after a groupBy in pandas

Given a df of the form
df = pd.DataFrame(
{
"date": [datetime.datetime(2018, 1, x) for x in range(1, 8)],
"label": ["A", "A", "B", "B", "C", "A", "C"],
"value1": [1, 22, 3, 4, 5, 6, 7],
"value2": [10, 4, 30, 5, 6, 8, 9]
}
)
df.set_index('date', inplace=True)
I'd like to have a single plot that contains all the 6 lines: the value of value1 and value2 for each of the groups. I browsed other answers, but I couldn't find how to properly do it. The best I have is
fig, ax = plt.subplots()
for label, df in grouped:
df.plot(ax=ax, label="Value for {}".format(label))
plt.legend()
which produces this result:
There are two problems here (prob the same):
I can't seem to control the label text
the label it is useless as it it now, because it is not informative
Any ideas?
Perhaps just use the axes to plot instead of the DataFrame and explicitly mention what you want to plot?
grouped = df.groupby('label')
fig, ax = plt.subplots()
for label, df2 in grouped:
ax.plot(df2['value1'], label=label+' value1')
ax.plot(df2['value2'], label=label+' value2')
plt.xticks(rotation=30)
plt.xlabel(df.index.name)
plt.legend()
Or if you don't want to write that out a lot of times, just specify which columns you want to plot ahead of time, and use another loop.
plot_vals = ['value1', 'value2']
fig, ax = plt.subplots()
for label, df2 in grouped:
for col in df2.columns[df2.columns.isin(plot_vals)]:
ax.plot(df2[col], label=label+ ' ' + col)
plt.xticks(rotation=30)
plt.xlabel(df.index.name)
plt.legend()