Plotting multiple columns after a groupBy in pandas - pandas

Given a df of the form
df = pd.DataFrame(
{
"date": [datetime.datetime(2018, 1, x) for x in range(1, 8)],
"label": ["A", "A", "B", "B", "C", "A", "C"],
"value1": [1, 22, 3, 4, 5, 6, 7],
"value2": [10, 4, 30, 5, 6, 8, 9]
}
)
df.set_index('date', inplace=True)
I'd like to have a single plot that contains all the 6 lines: the value of value1 and value2 for each of the groups. I browsed other answers, but I couldn't find how to properly do it. The best I have is
fig, ax = plt.subplots()
for label, df in grouped:
df.plot(ax=ax, label="Value for {}".format(label))
plt.legend()
which produces this result:
There are two problems here (prob the same):
I can't seem to control the label text
the label it is useless as it it now, because it is not informative
Any ideas?

Perhaps just use the axes to plot instead of the DataFrame and explicitly mention what you want to plot?
grouped = df.groupby('label')
fig, ax = plt.subplots()
for label, df2 in grouped:
ax.plot(df2['value1'], label=label+' value1')
ax.plot(df2['value2'], label=label+' value2')
plt.xticks(rotation=30)
plt.xlabel(df.index.name)
plt.legend()
Or if you don't want to write that out a lot of times, just specify which columns you want to plot ahead of time, and use another loop.
plot_vals = ['value1', 'value2']
fig, ax = plt.subplots()
for label, df2 in grouped:
for col in df2.columns[df2.columns.isin(plot_vals)]:
ax.plot(df2[col], label=label+ ' ' + col)
plt.xticks(rotation=30)
plt.xlabel(df.index.name)
plt.legend()

Related

How to align a legend relative to a GridSpec cell?

I am creating a figure like this:
fig = plt.figure(figsize = (7, 8))
outer_grid = gridspec.GridSpec(2, 1, height_ratios = [2, 1])
inner_grid1 = gridspec.GridSpecFromSubplotSpec(4, 3, subplot_spec=outer_grid[0])
inner_grid2 = gridspec.GridSpecFromSubplotSpec(2, 3, subplot_spec=outer_grid[1])
Now I would like to have one legend for all plots in inner_grid1 and a separate legend for all plots in inner_grid2. And I would like those legends to be placed nicely, even though they are higher than a single plot, and cannot have more than one column to not make the figure too wide.
Here is an example where I tried to align the legends with trial and error with method 2 below, however this took ages to make.
So I see three options to achieve this, none of which work:
Place the legend as part of an Axes object, but manually move it outside of the actual plot using axes.legend([...], bbox_to_anchor=(x, y)). This does not work when the legend is higher as a single plot, because it rescales the plots to fit the legend into its grid cell.
Place the legend globally on the Figure object. This works, but makes the correct placement really hard. I cannot use loc = "center right", since it centers it for the full figure instead of just the inner_grid1 or inner_grid2 plots.
Place the legend locally on the GridSpecFromSubplotSpec object. This would be perfect. However there is no method to create a legend on a GridSpecFromSubplotSpec or related classes, and the pyplot.legend method misses parameters to restrict the loc to parts of a grid.
Is there a way to place a legend as described?
As requested, a small code example generating something similar as desired.
This example uses method 2:
#!/usr/bin/env python3
import pandas as pd, seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
GENOMES = ["spneumoniae", "ecoliK12", "scerevisiae", "celegans", "bmori", "hg38"]
fig = plt.figure(figsize = (7, 8))
outer_grid = gridspec.GridSpec(2, 1, height_ratios = [2, 1])
inner_grid1 = gridspec.GridSpecFromSubplotSpec(4, 3, subplot_spec=outer_grid[0])
inner_grid2 = gridspec.GridSpecFromSubplotSpec(2, 3, subplot_spec=outer_grid[1])
# plots are in sets of six, 2 rows by 3 columns each
for index, genome in enumerate(GENOMES):
data = pd.DataFrame({"x": [0, 1, 2, 3, 0, 1, 2, 3], "y": [1, 0, 3, 2, 1, 0, 3, 2], "hue": ["a", "a", "a", "a", "b", "b", "b", "b"]})
# first set of six
ax1 = plt.Subplot(fig, inner_grid1[index])
ax1 = sns.lineplot(data = data, x = "x", y = "y", hue = "hue", ax = ax1)
ax1.set_xlabel("")
ax1.set_ylabel("")
if index == 2:
ax1.legend()
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, loc = "center left", title = "", bbox_to_anchor=(0.9, 2/3 - 0.03))
ax1.legend([], [], loc = "lower center", title = f"{genome}")
fig.add_subplot(ax1)
# second set of six
ax2 = plt.Subplot(fig, inner_grid1[index + 6])
ax2 = sns.lineplot(data = data, x = "x", y = "y", hue = "hue", ax = ax2)
ax2.set_xlabel("")
ax2.set_ylabel("")
ax2.legend([], [], loc = "upper center", title = f"{genome}")
fig.add_subplot(ax2)
#third set of six
ax3 = plt.Subplot(fig, inner_grid2[index])
ax3 = sns.lineplot(data = data, x = "x", y = "y", hue = "hue", ax = ax3)
ax3.set_xlabel("")
ax3.set_ylabel("")
if index == 2:
ax3.legend(["#unitigs", "avg. unitig len."])
handles, labels = ax3.get_legend_handles_labels()
fig.legend(handles, labels, loc = "center left", title = "", bbox_to_anchor=(0.9, 1/6 + 0.05))
ax3.legend([], [], loc = "upper center", title = f"{genome}")
fig.add_subplot(ax3)
plt.savefig("stackoverflow_test.pdf", bbox_inches="tight")

plotting 2 dictionaries in matplotlib

I have 2 dictionaries: dict1 = {'Beef':10, 'Poultry': 13, 'Pork': 14, 'Lamb': 11} and dict2 = {'Beef':3, 'Poultry': 1, 'Pork': 17, 'Lamb': 16}
I want to plot a double bar chart using the dictionary keys as the x-axis values, and the associated values on the y-axis. I am using matplotlib for this. does anyone have any information?
This part of the matplotlib documentation may what you are looking for. To plot your data, the x and y values need to be extracted from the dicts, for example via dict.keys() and dict.values().
import matplotlib.pyplot as plt
import numpy as np
dict1 = {'Beef':10, 'Poultry': 13, 'Pork': 14, 'Lamb': 11}
dict2 = {'Beef':3, 'Poultry': 1, 'Pork': 17, 'Lamb': 16}
x = dict1.keys()
y1 = dict1.values()
y2 = dict2.values()
N = len(x)
fig, ax = plt.subplots()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
p1 = ax.bar(ind, y1, width)
p2 = ax.bar(ind + width, y2, width)
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(x)
ax.legend((p1[0], p2[0]), ('dict1', 'dict2'))
plt.show()
Result:
I'd like to propose a more general approach: instead of just two dicts, what happens if we have a list of dictionaries?
In [89]: from random import randint, seed, shuffle
...: seed(20201213)
...: cats = 'a b c d e f g h i'.split() # categories
...: # List Of Dictionaries
...: lod = [{k:randint(5, 15) for k in shuffle(cats) or cats[:-2]} for _ in range(5)]
...: lod
Out[89]:
[{'d': 14, 'h': 10, 'i': 13, 'f': 13, 'c': 5, 'b': 5, 'a': 14},
{'h': 12, 'd': 5, 'c': 5, 'i': 11, 'b': 14, 'g': 8, 'e': 13},
{'d': 8, 'a': 12, 'f': 7, 'h': 10, 'g': 10, 'c': 11, 'i': 12},
{'g': 11, 'f': 8, 'i': 14, 'h': 11, 'a': 5, 'c': 7, 'b': 8},
{'e': 11, 'h': 13, 'c': 5, 'i': 8, 'd': 12, 'a': 11, 'g': 11}]
As you can see, the keys are not ordered in the same way and the dictionaries do not contain all the possible keys...
Our first step is to find a list of keys (lok), using a set comprehension, followed by sorting the keys (yes, we already know the keys, but here we are looking for a general solution…)
In [90]: lok = sorted(set(k for d in lod for k in d))
The number of elements in the two lists are
In [91]: nk, nd = len(lok), len(lod)
At this point we can compute the width of a single bar, saying that the bar groups are 1 unit apart (hence x = range(nk)) and that we leave 1/3 unit between the groups, we have
In [92]: x, w = range(nk), 0.67/nd
We are ready to go with the plot
In [93]: import matplotlib.pyplot as plt
...: for n, d in enumerate(lod):
...: plt.bar([ξ+n*w for ξ in x], [d.get(k, 0) for k in lok], w,
...: label='dict %d'%(n+1))
...: plt.xticks([ξ+w*nd/2 for ξ in x], lok)
...: plt.legend();
Let's write a small function
def plot_lod(lod, ws=0.33, ax=None, legend=True):
"""bar plot from the values in a list of dictionaries.
lod: list of dictionaries,
ws: optional, white space between groups of bars as a fraction of unity,
ax: optional, the Axes object to draw into,
legend: are we going to draw a legend?
Return: the Axes used to plot and a list of BarContainer objects."""
from matplotlib.pyplot import subplot
from numpy import arange, nan
if ax is None : ax = subplot()
lok = sorted({k for d in lod for k in d})
nk, nd = len(lok), len(lod)
x, w = arange(nk), (1.0-ws)/nd
lobars = [
ax.bar(x+n*w, [d.get(k, nan) for k in lok], w, label='%02d'%(n+1))
for n, d in enumerate(lod)
]
ax.set_xticks(x+w*nd/2-w/2)
ax.set_xticklabels(lok)
if legend : ax.legend()
return ax, lobars
Using the data of the previous example, we get a slightly different graph…

order plotly legend by custom order

I have a plotly figure to which I add values from two dataframes, df_a and df_b. I display both dataframes in different subplots, but they share a legend. How can I order the shared legend? The expected order is: [a, b, c, f, g]. Please see below for the current implementation - it seems to pick the order from input data in some way.
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
buckets = ["a", "b", "c", "f", "g"]
fig_subplots = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=["df_a", "df_b"])
def get_chart(df, buckets, d_legend):
fig = go.Figure()
colorscale = px.colors.qualitative.Pastel
i_color = 0
unique_dates = [str(dt) for dt in df.date.unique()]
for bucket in sorted(buckets, reverse=True):
y_values = df[df['variable'] == bucket]["value"].to_list()
enable_legend_for_bucket = False if bucket in d_legend else True
fig.add_trace(go.Bar(
name=bucket,
x=unique_dates,
y=y_values,
marker_color=colorscale[i_color],
legendgroup=bucket,
showlegend=enable_legend_for_bucket
))
if len(y_values) != 0:
d_legend[bucket] = True # store first time this bucket was populated for legend
i_color += 1
fig.update_layout(barmode="stack")
return fig, d_legend
list_df = [df_a.melt(id_vars="date"), df_b.melt(id_vars="date")]
d_legend = {}
iRow = 1
for df in list_df:
fig, d_legend = get_chart(df, buckets, d_legend)
for el in fig['data']:
fig_subplots.append_trace(el, iRow, 1)
iRow += 1
fig_subplots.update_layout(barmode='stack', legend={'traceorder':'normal'})
Without finding an easy built-in method to order the legend, and to have the same stacking of values with the legend sorted in any desired order.
new_order = sorted(['a','b','c','g','f'],reverse=True)
print(new_order)
ordered_object_list =[]
for i in new_order:
item = [obj for obj in fig_subplots.data if obj['name'] == i]
ordered_object_list += item
fig_subplots.data = ordered_object_list
fig_subplots.update_layout(legend={'traceorder':'reversed'})
There is also an approach with plotly express:
import pandas as pd
import plotly.express as px
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_a["id"] = "df_a"
df_b["id"] = "df_b"
df_c = df_a.append(df_b)
px.bar(df_c.melt(id_vars=["date", "id"]), x="date", y="value", color="variable", facet_col="id", range_y=[0,20], facet_col_wrap=1)

plot all columns of a pandas dataframe with matplotlib

I have a dataframe with a datetime-index and 65 columns.
And I want to plot all these colums among themselves,
not in a grid or in one figure.
df= test[['one', 'two', 'three','four']]
fig, ax = plt.subplots(figsize=(24, 5))
df.plot(ax=ax)
plt.show()
The example is all in one plot and not all among themselves.
You can loop over the columns of the DataFrame and create a new figure for each column. This will plot them all at once. If you want the next one to show up once the previous one is closed then put the call to plt.show() inside the for loop.
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({
'one': [1, 3, 2, 5],
'two': [9, 6, 4, 3],
'three': [0, 1, 1, 0],
'four': [-3, 2, -1, 0]})
for i, col in enumerate(df.columns):
df[col].plot(fig=plt.figure(i))
plt.title(col)
plt.show()

Plotting stacked barchart with pandas of multiple columns grouped

I have two dataframes which I need to get the difference and then plot one of them on top of this difference. Here is a minimal example:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.DataFrame([[2,5,7,6,7],[4,4,4,4,3],[8,8,7,3,4],[16,10,12,13,16]], columns=["N", "A", "B", "C", "D"])
df2 = pd.DataFrame([[2,1,3,6,5],[4,1,2,3,2],[8,2,2,3,3],[16,8,10,3,11]], columns=["N", "A", "B", "C", "D"])
dfDiff = df1 - df2
dfDiff['N'] = df1['N']
# Individual barchart
colors = ['#6c8ebf', '#82b366', '#F7A01D', '#9876a7']
df1.set_index('N')[["A", "B", "C", "D"]].plot.bar(color=colors)
df2.set_index('N')[["A", "B", "C", "D"]].plot.bar(color=colors)
dfStacked = pd.DataFrame(columns=["N", "A", "A_diff", "B", "B_diff"])
dfStacked["N"] = df2["N"]
dfStacked["A"] = df2["A"]
dfStacked["B"] = df2["B"]
dfStacked["C"] = df2["C"]
dfStacked["D"] = df2["D"]
dfStacked["A_diff"] = dfDiff["A"]
dfStacked["B_diff"] = dfDiff["B"]
dfStacked["C_diff"] = dfDiff["C"]
dfStacked["D_diff"] = dfDiff["D"]
dfStacked.set_index('N').plot.bar(stacked=True)
plt.show()
The dataframes look like this:
The thing is that now the new stacked one ends up with everything merged. I want to have "A" stacked with "A_diff", "B", stacked with "B_diff", "C" stacked with "C_diff" and "D" stacked with "D_diff".
For example, I changed the code to do it with "A" and "A_diff" as dfStacked.set_index('N')[["A", "A_diff"]].plot.bar(stacked=True) which looks correct, but I want A,B,C and D grouped by N like in the first two figures.
Do I need a new dataframe for this, like dfStacked? If so, in which form should the content be added? And how can I keep the same colors but add hatch="/" only for the "top" stacked bar?
Would it be better to have the dataframe as below?:
df3 = pd.DataFrame(columns=["N", "Algorithm", "df1", "dfDiff"])
df3.loc[len(df3)] = [2, "A", 20, 10]
df3.loc[len(df3)] = [2, "A", 1, 4]
df3.loc[len(df3)] = [4, "A", 2, 3]
df3.loc[len(df3)] = [4, "A", 3, 4]
df3.loc[len(df3)] = [2, "B", 1, 3]
df3.loc[len(df3)] = [2, "B", 2, 4]
df3.loc[len(df3)] = [4, "B", 3, 3]
df3.loc[len(df3)] = [4, "B", 4, 2]
But how to group them by "N" and "Algorithm"? I mean, each row corresponds to one bar, just they should be grouped by "N" with all the "Algorithms" and the two last columns are the two "parts" of each bar. It would be good that the colors match the first two figures (for the "Algorithms") but the top part of the bar has hatch="/" for example.
Thanks for the help
I'll start from df1, df2 and get dfStacked in a slightly different way:
import pandas as pd
df1 = pd.DataFrame(
[
[2,5,7,6,7],
[4,4,4,4,3],
[8,8,7,3,4],
[16,10,12,13,16]
],
columns=["N", "A", "B", "C", "D"]
).set_index('N')
df2 = pd.DataFrame(
[
[2,1,3,6,5],
[4,1,2,3,2],
[8,2,2,3,3],
[16,8,10,3,11]
],
columns=["N", "A", "B", "C", "D"]
).set_index('N')
dfStacked = pd.concat(
[df1, df1-df2],
axis=1,
keys=['raw','diff']
).reorder_levels([1,0], axis=1)
Now we have this DataFrame:
To draw this data in a bar chart stacked by the first level we could make use of two DataFrame.plot's features - ax and bottom. The first one is the location of the axes where the barplot should be drawn, the second one is for the values where the bottom line of the bars should start. For details run help(plt.bar) to read about bottom and help(pd.DataFrame.plot) to read about ax.
import matplotlib.pyplot as plt
from matplotlib.colors import TABLEAU_COLORS
plt.figure(figsize=(10,7))
ax = plt.gca()
names = dfStacked.columns.levels[0]
n = len(names)
color = iter(TABLEAU_COLORS)
w = 1/(n+2) # width
h = '/'*5 # hatch for diff values
for i, name in enumerate(names):
c = next(color) # color
p = n/2 - i # position
dfStacked[name]['raw'].plot.bar(
ax=ax,
position=p,
width=w,
color=c,
label=f'{name} raw'
)
dfStacked[name]['diff'].plot.bar(
ax=ax,
bottom=dfStacked[name]['raw'],
hatch=h,
position=p,
width=w,
color=c,
label=f'{name} diff'
)
ax.set_xlim([-1, n])
ax.tick_params(axis='x', rotation=0)
ax.legend();
And here's the output:
IIUC, try using position parameter in pd.DataFrame.plot.bar:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df1 = pd.DataFrame([[2,5,7,6,7],[4,4,4,4,3],[8,8,7,3,4],[16,10,12,13,16]], columns=["N", "A", "B", "C", "D"])
df2 = pd.DataFrame([[2,1,3,6,5],[4,1,2,3,2],[8,2,2,3,3],[16,8,10,3,11]], columns=["N", "A", "B", "C", "D"])
dfDiff = df1 - df2
dfDiff['N'] = df1['N']
dfStacked = pd.DataFrame(columns=["N", "A", "A_diff", "B", "B_diff"])
dfStacked["N"] = df2["N"]
dfStacked["A"] = df2["A"]
dfStacked["B"] = df2["B"]
dfStacked["C"] = df2["C"]
dfStacked["D"] = df2["D"]
dfStacked["A_diff"] = dfDiff["A"]
dfStacked["B_diff"] = dfDiff["B"]
dfStacked["C_diff"] = dfDiff["C"]
dfStacked["D_diff"] = dfDiff["D"]
dfStacked = dfStacked.set_index('N')
colors = ['red', 'slateblue', 'lightseagreen', 'orange']
colors_c = ['darkred', 'blue', 'darkgreen', 'darkorange']
ax = dfStacked.filter(like='A').plot.bar(stacked=True, position=2, width=.1, color=[colors[0], colors_c[0]], edgecolor='w', alpha=.8)
dfStacked.filter(like='B').plot.bar(stacked=True, ax=ax, position=1, width=.1, color=[colors[1], colors_c[1]], edgecolor='w', alpha=.8)
dfStacked.filter(like='C').plot.bar(stacked=True, ax=ax, position=0, width=.1, color=[colors[2], colors_c[2]], edgecolor='w', alpha=.8)
dfStacked.filter(like='D').plot.bar(stacked=True, ax=ax, position=-1, width=.1, color=[colors[3], colors_c[3]], edgecolor='w', alpha=.8)
ax.set_xlim(-.5,3.5)
plt.legend(loc='upper center', ncol=4, bbox_to_anchor=(.5, 1.2))
plt.show()
Output: