order plotly legend by custom order - pandas

I have a plotly figure to which I add values from two dataframes, df_a and df_b. I display both dataframes in different subplots, but they share a legend. How can I order the shared legend? The expected order is: [a, b, c, f, g]. Please see below for the current implementation - it seems to pick the order from input data in some way.
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
buckets = ["a", "b", "c", "f", "g"]
fig_subplots = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=["df_a", "df_b"])
def get_chart(df, buckets, d_legend):
    """Build a stacked bar figure with one trace per bucket.

    df: long-format frame with ``date``, ``variable`` and ``value`` columns
        (for example the output of ``DataFrame.melt(id_vars="date")``),
    buckets: iterable of bucket names; traces are added in reverse-sorted order,
    d_legend: dict of buckets that already own a legend entry; it is updated
        in place and also returned.

    Return: the new figure and the updated ``d_legend``.
    """
    fig = go.Figure()
    colorscale = px.colors.qualitative.Pastel
    unique_dates = [str(dt) for dt in df.date.unique()]
    for i_color, bucket in enumerate(sorted(buckets, reverse=True)):
        y_values = df.loc[df["variable"] == bucket, "value"].to_list()
        fig.add_trace(go.Bar(
            name=bucket,
            x=unique_dates,
            y=y_values,
            # Wrap around the palette so more buckets than colours cannot
            # raise an IndexError.
            marker_color=colorscale[i_color % len(colorscale)],
            legendgroup=bucket,
            # Only buckets not yet recorded in d_legend show a legend entry.
            showlegend=bucket not in d_legend,
        ))
        if y_values:
            # Record the bucket the first time it actually carries data, so
            # later figures hide its duplicate legend entry.
            d_legend[bucket] = True
    fig.update_layout(barmode="stack")
    return fig, d_legend
# Melt both frames to long format, build one chart per frame and copy its
# traces into the matching subplot row (rows are numbered from 1).
list_df = [frame.melt(id_vars="date") for frame in (df_a, df_b)]
d_legend = {}
for iRow, df in enumerate(list_df, start=1):
    fig, d_legend = get_chart(df, buckets, d_legend)
    for el in fig['data']:
        fig_subplots.append_trace(el, iRow, 1)
fig_subplots.update_layout(barmode='stack', legend={'traceorder': 'normal'})

I could not find an easy built-in method to order the legend. As a workaround, the snippet below reorders the traces themselves, which keeps the same stacking of values while letting the legend be sorted in any desired order.
# Rebuild the trace list in the wanted (reverse-sorted) name order, then let
# the legend list the traces in reversed order.
new_order = sorted(['a','b','c','g','f'], reverse=True)
print(new_order)
ordered_object_list = [
    trace
    for wanted_name in new_order
    for trace in fig_subplots.data
    if trace['name'] == wanted_name
]
fig_subplots.data = ordered_object_list
fig_subplots.update_layout(legend={'traceorder': 'reversed'})

There is also an approach with plotly express:
import pandas as pd
import plotly.express as px
df_a = pd.DataFrame(columns=["date", "a", "b", "c"], data=[
[pd.to_datetime("31Jan20"), 3, 4, 5],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
df_b = pd.DataFrame(columns=["date", "a", "g", "f"], data=[
[pd.to_datetime("31Jan20"), 8, 5, 4],
[pd.to_datetime("31Mar20"), 3, 4, 5],
[pd.to_datetime("30Jun20"), 3, 4, 5],
])
# Tag every row with the frame it came from so the chart can be faceted on it.
df_a["id"] = "df_a"
df_b["id"] = "df_b"
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (original row labels are kept).
df_c = pd.concat([df_a, df_b])
px.bar(df_c.melt(id_vars=["date", "id"]), x="date", y="value", color="variable", facet_col="id", range_y=[0,20], facet_col_wrap=1)

Related

plotting 2 dictionaries in matplotlib

I have 2 dictionaries: dict1 = {'Beef':10, 'Poultry': 13, 'Pork': 14, 'Lamb': 11} and dict2 = {'Beef':3, 'Poultry': 1, 'Pork': 17, 'Lamb': 16}
I want to plot a double bar chart using the dictionary keys as the x-axis values, and the associated values on the y-axis. I am using matplotlib for this. does anyone have any information?
This part of the matplotlib documentation may be what you are looking for. To plot your data, the x and y values need to be extracted from the dicts, for example via dict.keys() and dict.values().
import matplotlib.pyplot as plt
import numpy as np
# Two dictionaries with the same keys; each one becomes one series of bars.
dict1 = {'Beef':10, 'Poultry': 13, 'Pork': 14, 'Lamb': 11}
dict2 = {'Beef':3, 'Poultry': 1, 'Pork': 17, 'Lamb': 16}
# The tick labels come from the keys of dict1, the bar heights from the values.
# NOTE: y2 follows dict2's own key order, so its bars only match the labels
# when both dicts list their keys in the same order (as they do here).
x = dict1.keys()
y1 = dict1.values()
y2 = dict2.values()
N = len(x)
fig, ax = plt.subplots()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
# The second series is shifted right by one bar width so the bars of a group
# sit next to each other.
p1 = ax.bar(ind, y1, width)
p2 = ax.bar(ind + width, y2, width)
# Place each tick halfway between the two x positions used for a group.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(x)
# One legend entry per series, using the first bar of each as the handle.
ax.legend((p1[0], p2[0]), ('dict1', 'dict2'))
plt.show()
Result:
I'd like to propose a more general approach: instead of just two dicts, what happens if we have a list of dictionaries?
In [89]: from random import randint, seed, shuffle
...: seed(20201213)
...: cats = 'a b c d e f g h i'.split() # categories
...: # List Of Dictionaries
...: lod = [{k:randint(5, 15) for k in shuffle(cats) or cats[:-2]} for _ in range(5)]
...: lod
Out[89]:
[{'d': 14, 'h': 10, 'i': 13, 'f': 13, 'c': 5, 'b': 5, 'a': 14},
{'h': 12, 'd': 5, 'c': 5, 'i': 11, 'b': 14, 'g': 8, 'e': 13},
{'d': 8, 'a': 12, 'f': 7, 'h': 10, 'g': 10, 'c': 11, 'i': 12},
{'g': 11, 'f': 8, 'i': 14, 'h': 11, 'a': 5, 'c': 7, 'b': 8},
{'e': 11, 'h': 13, 'c': 5, 'i': 8, 'd': 12, 'a': 11, 'g': 11}]
As you can see, the keys are not ordered in the same way and the dictionaries do not contain all the possible keys...
Our first step is to find a list of keys (lok), using a set comprehension, followed by sorting the keys (yes, we already know the keys, but here we are looking for a general solution…)
In [90]: lok = sorted(set(k for d in lod for k in d))
The number of elements in the two lists are
In [91]: nk, nd = len(lok), len(lod)
At this point we can compute the width of a single bar, saying that the bar groups are 1 unit apart (hence x = range(nk)) and that we leave 1/3 unit between the groups, we have
In [92]: x, w = range(nk), 0.67/nd
We are ready to go with the plot
In [93]: import matplotlib.pyplot as plt
...: for n, d in enumerate(lod):
...: plt.bar([ξ+n*w for ξ in x], [d.get(k, 0) for k in lok], w,
...: label='dict %d'%(n+1))
...: plt.xticks([ξ+w*nd/2 for ξ in x], lok)
...: plt.legend();
Let's write a small function
def plot_lod(lod, ws=0.33, ax=None, legend=True):
    """Bar plot from the values in a list of dictionaries.

    lod: list of dictionaries,
    ws: optional, white space between groups of bars as a fraction of unity,
    ax: optional, the Axes object to draw into,
    legend: are we going to draw a legend?

    Return: the Axes used to plot and a list of BarContainer objects
    (an empty list, with nothing drawn, when lod is empty).
    """
    from numpy import arange, nan

    if ax is None:
        # Imported lazily so that callers who pass their own Axes never
        # trigger the pyplot import.
        from matplotlib.pyplot import subplot
        ax = subplot()
    if not lod:
        # Nothing to draw; the bar width below would divide by zero.
        return ax, []
    # Sorted union of all keys: one bar group per key.
    lok = sorted({k for d in lod for k in d})
    nk, nd = len(lok), len(lod)
    # Groups are one unit apart; the nd bars share what is left after ws.
    x, w = arange(nk), (1.0 - ws) / nd
    lobars = [
        # Keys missing from a dictionary are passed as NaN heights.
        ax.bar(x + n*w, [d.get(k, nan) for k in lok], w, label='%02d' % (n + 1))
        for n, d in enumerate(lod)
    ]
    # Tick at the middle of the x offsets used for each group of bars.
    ax.set_xticks(x + w*nd/2 - w/2)
    ax.set_xticklabels(lok)
    if legend:
        ax.legend()
    return ax, lobars
Using the data of the previous example, we get a slightly different graph…

plot a groupby object with bokeh

Consider the following MWE.
from pandas import DataFrame
from bokeh.plotting import figure
data = dict(x = [0,1,2,0,1,2],
y = [0,1,2,4,5,6],
g = [1,1,1,2,2,2])
df = DataFrame(data)
p = figure()
p.line( 'x', 'y', source=df[ df.g == 1 ] )
p.line( 'x', 'y', source=df[ df.g == 2 ] )
Ideally, I would like to compress the last two lines into one:
p.line( 'x', 'y', source=df.groupby('g') )
(Real life examples have a large and variable number of groups.) Is there any concise way to do this?
I just found out that the following works
gby = df.groupby('g')
gby.apply( lambda d: p.line( 'x', 'y', source=d ) )
(it has some drawbacks, though).
Any better idea?
I couldn't come up with a solution based on df.groupby, so I used df.loc instead, but maybe multi_line is what you are after:
from pandas import DataFrame
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
data = dict(x = [0, 1, 2, 0, 1, 2],
y = [0, 1, 2, 4, 5, 6],
g = [1, 1, 1, 2, 2, 2])
df = DataFrame(data, index = data['g'])
dfs = [DataFrame(df.loc[i].values, columns = df.columns) for i in df['g'].unique()]
source = ColumnDataSource(dict(x = [df['x'].values for df in dfs], y = [df['y'].values for df in dfs]))
p = figure()
p.multi_line('x', 'y', source = source)
show(p)
Result:
This is Tony's solution slightly simplified.
import pandas as pd
from bokeh.plotting import figure
data = dict(x = [0, 1, 2, 0, 1, 2],
y = [0, 1, 2, 4, 5, 6],
g = [1, 1, 1, 2, 2, 2])
df = pd.DataFrame(data)
####################### So far as in the OP
gby = df.groupby('g')
p = figure()
x = [list( sdf['x'] ) for i,sdf in gby]
y = [list( sdf['y'] ) for i,sdf in gby]
p.multi_line( x, y )
from pandas import DataFrame
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
# Draw one line glyph per group of column g, each from its own data source.
data = {
    "x": [0, 1, 2, 0, 1, 2],
    "y": [0, 1, 2, 4, 5, 6],
    "g": [1, 1, 1, 2, 2, 2],
}
df = DataFrame(data)
plt = figure()
for _, group in df.groupby(['g']):
    plt.line('x', 'y', source=ColumnDataSource(group), legend_group='g')
show(plt)

Plotting multiple columns after a groupBy in pandas

Given a df of the form
df = pd.DataFrame(
{
"date": [datetime.datetime(2018, 1, x) for x in range(1, 8)],
"label": ["A", "A", "B", "B", "C", "A", "C"],
"value1": [1, 22, 3, 4, 5, 6, 7],
"value2": [10, 4, 30, 5, 6, 8, 9]
}
)
df.set_index('date', inplace=True)
I'd like to have a single plot that contains all the 6 lines: the value of value1 and value2 for each of the groups. I browsed other answers, but I couldn't find how to properly do it. The best I have is
fig, ax = plt.subplots()
for label, df in grouped:
df.plot(ax=ax, label="Value for {}".format(label))
plt.legend()
which produces this result:
There are two problems here (prob the same):
I can't seem to control the label text
the label is useless as it is now, because it is not informative
Any ideas?
Perhaps just use the axes to plot instead of the DataFrame and explicitly mention what you want to plot?
grouped = df.groupby('label')
fig, ax = plt.subplots()
# Plot both value columns of every label group on the one shared Axes, with a
# legend label that names the group and the column.
for label, df2 in grouped:
    for column in ('value1', 'value2'):
        ax.plot(df2[column], label=label + ' ' + column)
plt.xticks(rotation=30)
plt.xlabel(df.index.name)
plt.legend()
Or if you don't want to write that out a lot of times, just specify which columns you want to plot ahead of time, and use another loop.
plot_vals = ['value1', 'value2']
fig, ax = plt.subplots()
for label, df2 in grouped:
    # Keep the frame's own column order, restricted to the requested columns.
    wanted = [col for col in df2.columns if col in plot_vals]
    for col in wanted:
        ax.plot(df2[col], label=label + ' ' + col)
plt.xticks(rotation=30)
plt.xlabel(df.index.name)
plt.legend()

Plotting stacked barchart with pandas of multiple columns grouped

I have two dataframes which I need to get the difference and then plot one of them on top of this difference. Here is a minimal example:
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.DataFrame([[2,5,7,6,7],[4,4,4,4,3],[8,8,7,3,4],[16,10,12,13,16]], columns=["N", "A", "B", "C", "D"])
df2 = pd.DataFrame([[2,1,3,6,5],[4,1,2,3,2],[8,2,2,3,3],[16,8,10,3,11]], columns=["N", "A", "B", "C", "D"])
dfDiff = df1 - df2
dfDiff['N'] = df1['N']
# Individual barchart
colors = ['#6c8ebf', '#82b366', '#F7A01D', '#9876a7']
df1.set_index('N')[["A", "B", "C", "D"]].plot.bar(color=colors)
df2.set_index('N')[["A", "B", "C", "D"]].plot.bar(color=colors)
dfStacked = pd.DataFrame(columns=["N", "A", "A_diff", "B", "B_diff"])
dfStacked["N"] = df2["N"]
dfStacked["A"] = df2["A"]
dfStacked["B"] = df2["B"]
dfStacked["C"] = df2["C"]
dfStacked["D"] = df2["D"]
dfStacked["A_diff"] = dfDiff["A"]
dfStacked["B_diff"] = dfDiff["B"]
dfStacked["C_diff"] = dfDiff["C"]
dfStacked["D_diff"] = dfDiff["D"]
dfStacked.set_index('N').plot.bar(stacked=True)
plt.show()
The dataframes look like this:
The thing is that now the new stacked one ends up with everything merged. I want to have "A" stacked with "A_diff", "B", stacked with "B_diff", "C" stacked with "C_diff" and "D" stacked with "D_diff".
For example, I changed the code to do it with "A" and "A_diff" as dfStacked.set_index('N')[["A", "A_diff"]].plot.bar(stacked=True) which looks correct, but I want A,B,C and D grouped by N like in the first two figures.
Do I need a new dataframe for this, like dfStacked? If so, in which form should the content be added? And how can I keep the same colors but add hatch="/" only for the "top" stacked bar?
Would it be better to have the dataframe as below?:
df3 = pd.DataFrame(columns=["N", "Algorithm", "df1", "dfDiff"])
df3.loc[len(df3)] = [2, "A", 20, 10]
df3.loc[len(df3)] = [2, "A", 1, 4]
df3.loc[len(df3)] = [4, "A", 2, 3]
df3.loc[len(df3)] = [4, "A", 3, 4]
df3.loc[len(df3)] = [2, "B", 1, 3]
df3.loc[len(df3)] = [2, "B", 2, 4]
df3.loc[len(df3)] = [4, "B", 3, 3]
df3.loc[len(df3)] = [4, "B", 4, 2]
But how to group them by "N" and "Algorithm"? I mean, each row corresponds to one bar, just they should be grouped by "N" with all the "Algorithms" and the two last columns are the two "parts" of each bar. It would be good that the colors match the first two figures (for the "Algorithms") but the top part of the bar has hatch="/" for example.
Thanks for the help
I'll start from df1, df2 and get dfStacked in a slightly different way:
import pandas as pd
df1 = pd.DataFrame(
[
[2,5,7,6,7],
[4,4,4,4,3],
[8,8,7,3,4],
[16,10,12,13,16]
],
columns=["N", "A", "B", "C", "D"]
).set_index('N')
df2 = pd.DataFrame(
[
[2,1,3,6,5],
[4,1,2,3,2],
[8,2,2,3,3],
[16,8,10,3,11]
],
columns=["N", "A", "B", "C", "D"]
).set_index('N')
dfStacked = pd.concat(
[df1, df1-df2],
axis=1,
keys=['raw','diff']
).reorder_levels([1,0], axis=1)
Now we have this DataFrame:
To draw this data in a bar chart stacked by the first level we could make use of two DataFrame.plot's features - ax and bottom. The first one is the location of the axes where the barplot should be drawn, the second one is for the values where the bottom line of the bars should start. For details run help(plt.bar) to read about bottom and help(pd.DataFrame.plot) to read about ax.
import matplotlib.pyplot as plt
from matplotlib.colors import TABLEAU_COLORS
# Grouped and stacked bars: for every first-level column name, draw its 'raw'
# values and stack the hatched 'diff' values on top, all on one Axes.
plt.figure(figsize=(10,7))
ax = plt.gca()
names = dfStacked.columns.levels[0]
n = len(names)
palette = list(TABLEAU_COLORS)
w = 1/(n+2) # width
h = '/'*5 # hatch for diff values
for i, name in enumerate(names):
    # Wrap around the palette so more names than colours cannot exhaust it.
    c = palette[i % len(palette)] # color
    p = n/2 - i # position
    raw = dfStacked[name]['raw']
    raw.plot.bar(
        ax=ax,
        position=p,
        width=w,
        color=c,
        label=f'{name} raw'
    )
    dfStacked[name]['diff'].plot.bar(
        ax=ax,
        bottom=raw,  # start the diff bar where the raw bar ends
        hatch=h,
        position=p,
        width=w,
        color=c,
        label=f'{name} diff'
    )
# The x extent depends on the number of rows (one tick per index value), not
# on the number of column groups; the two only coincide in the sample data.
ax.set_xlim([-1, len(dfStacked)])
ax.tick_params(axis='x', rotation=0)
ax.legend();
And here's the output:
IIUC, try using position parameter in pd.DataFrame.plot.bar:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df1 = pd.DataFrame([[2,5,7,6,7],[4,4,4,4,3],[8,8,7,3,4],[16,10,12,13,16]], columns=["N", "A", "B", "C", "D"])
df2 = pd.DataFrame([[2,1,3,6,5],[4,1,2,3,2],[8,2,2,3,3],[16,8,10,3,11]], columns=["N", "A", "B", "C", "D"])
dfDiff = df1 - df2
dfDiff['N'] = df1['N']
dfStacked = pd.DataFrame(columns=["N", "A", "A_diff", "B", "B_diff"])
dfStacked["N"] = df2["N"]
dfStacked["A"] = df2["A"]
dfStacked["B"] = df2["B"]
dfStacked["C"] = df2["C"]
dfStacked["D"] = df2["D"]
dfStacked["A_diff"] = dfDiff["A"]
dfStacked["B_diff"] = dfDiff["B"]
dfStacked["C_diff"] = dfDiff["C"]
dfStacked["D_diff"] = dfDiff["D"]
dfStacked = dfStacked.set_index('N')
colors = ['red', 'slateblue', 'lightseagreen', 'orange']
colors_c = ['darkred', 'blue', 'darkgreen', 'darkorange']
# One stacked pair (X, X_diff) per letter; position runs 2, 1, 0, -1 so the
# pairs sit side by side. The first call creates the Axes, the rest reuse it.
ax = None
for step, (letter, base, dark) in enumerate(zip('ABCD', colors, colors_c)):
    ax = dfStacked.filter(like=letter).plot.bar(
        stacked=True,
        ax=ax,
        position=2 - step,
        width=.1,
        color=[base, dark],
        edgecolor='w',
        alpha=.8,
    )
ax.set_xlim(-.5, 3.5)
plt.legend(loc='upper center', ncol=4, bbox_to_anchor=(.5, 1.2))
plt.show()
Output:

Managing high dimensions in Numpy

I want to write a function of 4 variables : f(x1,x2,x3,x4), each in a different dimension.
This can be achieved by f(x1,x2[newaxis],x3[newaxis,newaxis],x4[newaxis,newaxis,newaxis]).
Do you know a smarter way ?
You're looking for np.ix_:
f(*np.ix_(x1, x2, x3, x4))
For example:
>>> np.ix_([1, 2, 3], [4, 5])
(array([[1],
[2],
[3]]), array([[4, 5]]))
Note: np.meshgrid(..., sparse=True, indexing='ij') is an equivalent alternative to np.ix_.
One way would be to reshape each array giving appropriate number of singleton dimensions along the leading axes. To do this across all arrays, we could use a list comprehension.
Thus, one way to handle generic number of input arrays would be -
L = [x1, x2, x3, x4]
# The array at index `depth` gets `depth` leading singleton axes, followed by
# its own length as the last axis.
out = [arr.reshape((1,) * depth + (len(arr),)) for depth, arr in enumerate(L)]
Sample run -
In [186]: # Initialize input arrays
...: x1 = np.random.randint(0,9,(4))
...: x2 = np.random.randint(0,9,(2))
...: x3 = np.random.randint(0,9,(5))
...: x4 = np.random.randint(0,9,(3))
...:
In [187]: A = x1,x2[None],x3[None,None],x4[None,None,None]
In [188]: L = [x1,x2,x3,x4]
...: out = [l.reshape([1]*i + [len(l)]) for i,l in enumerate(L)]
...:
In [189]: A
Out[189]:
(array([2, 1, 1, 1]),
array([[8, 2]]),
array([[[0, 3, 5, 8, 7]]]),
array([[[[6, 7, 0]]]]))
In [190]: out
Out[190]:
[array([2, 1, 1, 1]),
array([[8, 2]]),
array([[[0, 3, 5, 8, 7]]]),
array([[[[6, 7, 0]]]])]