Creating a stacked and grouped bar chart in pandas [duplicate] - matplotlib

Here is what my data set looks like:
In [1]: df1=pd.DataFrame(np.random.rand(4,2),index=["A","B","C","D"],columns=["I","J"])
In [2]: df2=pd.DataFrame(np.random.rand(4,2),index=["A","B","C","D"],columns=["I","J"])
In [3]: df1
Out[3]:
I J
A 0.675616 0.177597
B 0.675693 0.598682
C 0.631376 0.598966
D 0.229858 0.378817
In [4]: df2
Out[4]:
I J
A 0.939620 0.984616
B 0.314818 0.456252
C 0.630907 0.656341
D 0.020994 0.538303
I want to have stacked bar plot for each dataframe but since they have same index, I'd like to have 2 stacked bars per index.
I've tried to plot both on the same axes :
In [5]: ax = df1.plot(kind="bar", stacked=True)
In [5]: ax2 = df2.plot(kind="bar", stacked=True, ax = ax)
But it overlaps.
Then I tried to concat the two dataset first :
pd.concat(dict(df1 = df1, df2 = df2),axis = 1).plot(kind="bar", stacked=True)
but here everything is stacked
My best try is :
pd.concat(dict(df1 = df1, df2 = df2),axis = 0).plot(kind="bar", stacked=True)
Which gives :
This is basically what I want, except that I want the bar ordered as
(df1,A) (df2,A) (df1,B) (df2,B) etc...
I guess there is a trick, but I can't find it!
After #bgschiller's answer I got this :
Which is almost what I want. I would like the bar to be clustered by index, in order to have something visually clear.
Bonus : Having the x-label not redundant, something like :
df1 df2 df1 df2
_______ _______ ...
A B

I eventually found a trick (edit: see below for using seaborn and longform dataframe):
Solution with pandas and matplotlib
Here it is with a more complete example :
import pandas as pd
import matplotlib.cm as cm
import numpy as np
import matplotlib.pyplot as plt
def plot_clustered_stacked(dfall, labels=None, title="multiple stacked bar plot", H="/", **kwargs):
    """Draw several stacked bar plots on one axes, clustered by index value.

    Each dataframe in ``dfall`` is first drawn as an ordinary stacked bar
    plot on a shared axes. The rectangles of every dataframe are then
    narrowed, shifted to the right and hatched, so that the bars sharing an
    index label end up next to each other instead of on top of each other.

    Parameters
    ----------
    dfall : list of pandas.DataFrame
        Dataframes with identical columns and index. Must not be empty.
    labels : list of str, optional
        Names of the dataframes, used for the second (hatch) legend. When
        omitted, only the column legend is drawn.
    title : str
        Title of the plot.
    H : str
        Hatch pattern used to tell the dataframes apart. The k-th dataframe
        (counting from 0) is hatched with ``H * k``, so the first one is
        left unhatched.
    **kwargs
        Forwarded unchanged to ``DataFrame.plot`` (for example ``cmap``).

    Returns
    -------
    The matplotlib axes the bars were drawn on.

    Raises
    ------
    ValueError
        If ``dfall`` is empty.
    """
    n_df = len(dfall)
    if n_df == 0:
        raise ValueError("dfall must contain at least one dataframe")

    reference = dfall[0]
    n_col = len(reference.columns)
    n_ind = len(reference.index)
    bar_width = 1 / float(n_df + 1)
    axe = plt.subplot(111)

    for df in dfall:  # for each data frame
        axe = df.plot(kind="bar",
                      linewidth=0,
                      stacked=True,
                      ax=axe,
                      legend=False,
                      grid=False,
                      **kwargs)  # make bar plots

    # One handle per (dataframe, column): len(h) == n_col * n_df.
    h, l = axe.get_legend_handles_labels()
    for i in range(0, n_df * n_col, n_col):
        df_number = i // n_col
        for pa in h[i:i + n_col]:
            for rect in pa.patches:  # for each index
                rect.set_x(rect.get_x() + bar_width * i / float(n_col))
                rect.set_hatch(H * df_number)
                rect.set_width(bar_width)

    # Shift the ticks to the right so they sit under the shifted bars.
    axe.set_xticks((np.arange(0, 2 * n_ind, 2) + bar_width) / 2.)
    # Labels come from the first dataframe (all are required to share one
    # index) rather than from whatever the plotting loop variable last held.
    axe.set_xticklabels(reference.index, rotation=0)
    axe.set_title(title)

    # Add invisible (zero-sized) bars whose only purpose is to provide
    # handles for the hatch legend.
    hatch_handles = [axe.bar(0, 0, color="gray", hatch=H * i)
                     for i in range(n_df)]

    column_legend = axe.legend(h[:n_col], l[:n_col], loc=[1.01, 0.5])
    if labels is not None:
        # Creating a second legend replaces the first one on the axes, so
        # the column legend has to be re-attached as a plain artist.
        axe.legend(hatch_handles, labels, loc=[1.01, 0.1])
        axe.add_artist(column_legend)
    return axe
# create fake dataframes
df1 = pd.DataFrame(np.random.rand(4, 5),
index=["A", "B", "C", "D"],
columns=["I", "J", "K", "L", "M"])
df2 = pd.DataFrame(np.random.rand(4, 5),
index=["A", "B", "C", "D"],
columns=["I", "J", "K", "L", "M"])
df3 = pd.DataFrame(np.random.rand(4, 5),
index=["A", "B", "C", "D"],
columns=["I", "J", "K", "L", "M"])
# Then, just call :
plot_clustered_stacked([df1, df2, df3],["df1", "df2", "df3"])
And it gives that :
You can change the colors of the bar by passing a cmap argument:
plot_clustered_stacked([df1, df2, df3],
["df1", "df2", "df3"],
cmap=plt.cm.viridis)
Solution with seaborn:
Given the same df1, df2, df3, below, I convert them in a long form:
df1["Name"] = "df1"
df2["Name"] = "df2"
df3["Name"] = "df3"
dfall = pd.concat([pd.melt(i.reset_index(),
id_vars=["Name", "index"]) # transform in tidy format each df
for i in [df1, df2, df3]],
ignore_index=True)
The problem with seaborn is that it doesn't stack bars natively, so the trick is to plot the cumulative sum of each bar on top of each other:
# Index by (dataframe name, original index, column name) so the cumulative
# sum can be taken per bar, i.e. per (Name, index) pair.
# `inplace` must be a real bool: pandas validates it and rejects `1`.
dfall.set_index(["Name", "index", "variable"], inplace=True)
# Select the "value" column explicitly so a Series (not a one-column
# DataFrame) is assigned to the new "vcs" column.
dfall["vcs"] = dfall.groupby(level=["Name", "index"])["value"].cumsum()
dfall.reset_index(inplace=True)
>>> dfall.head(6)
Name index variable value vcs
0 df1 A I 0.717286 0.717286
1 df1 B I 0.236867 0.236867
2 df1 C I 0.952557 0.952557
3 df1 D I 0.487995 0.487995
4 df1 A J 0.174489 0.891775
5 df1 B J 0.332001 0.568868
Then loop over each group of variable and plot the cumulative sum:
# NOTE: this snippet needs seaborn in scope as `sns` (import seaborn as sns).
c = ["blue", "purple", "red", "green", "pink"]
# sort=False keeps the variables in the order in which they appear in dfall,
# which is the order the cumulative sum "vcs" was accumulated in. With the
# default alphabetical sort, the z-order only matches the stacking order
# when the column names happen to be alphabetical.
for i, (_, group) in enumerate(dfall.groupby("variable", sort=False)):
    ax = sns.barplot(data=group,
                     x="index",
                     y="vcs",
                     hue="Name",
                     color=c[i % len(c)],  # cycle if there are more variables than colors
                     zorder=-i,  # so first bars stay on top
                     edgecolor="k")
ax.legend_.remove()  # remove the redundant legends
It lacks the legend that can be added easily I think. The problem is that instead of hatches (which can be added easily) to differentiate the dataframes we have a gradient of lightness, and it's a bit too light for the first one, and I don't really know how to change that without changing each rectangle one by one (as in the first solution).
Tell me if you don't understand something in the code.
Feel free to re-use this code which is under CC0.

This is a great start but I think the colors could be modified a bit for clarity. Also be careful about importing every argument in Altair as this may cause collisions with existing objects in your namespace. Here is some reconfigured code to display the correct color display when stacking the values:
Import packages
import pandas as pd
import numpy as np
import altair as alt
Generate some random data
df1=pd.DataFrame(10*np.random.rand(4,3),index=["A","B","C","D"],columns=["I","J","K"])
df2=pd.DataFrame(10*np.random.rand(4,3),index=["A","B","C","D"],columns=["I","J","K"])
df3=pd.DataFrame(10*np.random.rand(4,3),index=["A","B","C","D"],columns=["I","J","K"])
def prep_df(df, name):
    """Return ``df`` in long form, tagged with the name of its source.

    The frame is stacked so that every cell becomes one row with three
    columns: ``c1`` (original index label), ``c2`` (original column label)
    and ``values`` (the cell value). A fourth column ``DF`` holds ``name``
    on every row. The input frame is not modified.
    """
    long_form = df.stack().reset_index()
    long_form.columns = ['c1', 'c2', 'values']
    return long_form.assign(DF=name)
df1 = prep_df(df1, 'DF1')
df2 = prep_df(df2, 'DF2')
df3 = prep_df(df3, 'DF3')
df = pd.concat([df1, df2, df3])
Plot data with Altair
alt.Chart(df).mark_bar().encode(
# tell Altair which field to group columns on
x=alt.X('c2:N', title=None),
# tell Altair which field to use as Y values and how to calculate
y=alt.Y('sum(values):Q',
axis=alt.Axis(
grid=False,
title=None)),
# tell Altair which field to use to use as the set of columns to be represented in each group
column=alt.Column('c1:N', title=None),
# tell Altair which field to use for color segmentation
color=alt.Color('DF:N',
scale=alt.Scale(
# make it look pretty with an enjoyable color pallet
range=['#96ceb4', '#ffcc5c','#ff6f69'],
),
))\
.configure_view(
# remove grid lines around column clusters
strokeOpacity=0
)

I have managed to do the same using pandas and matplotlib subplots with basic commands.
Here's an example:
fig, axes = plt.subplots(nrows=1, ncols=3)
ax_position = 0
for concept in df.index.get_level_values('concept').unique():
idx = pd.IndexSlice
subset = df.loc[idx[[concept], :],
['cmp_tr_neg_p_wrk', 'exp_tr_pos_p_wrk',
'cmp_p_spot', 'exp_p_spot']]
print(subset.info())
subset = subset.groupby(
subset.index.get_level_values('datetime').year).sum()
subset = subset / 4 # quarter hours
subset = subset / 100 # installed capacity
ax = subset.plot(kind="bar", stacked=True, colormap="Blues",
ax=axes[ax_position])
ax.set_title("Concept \"" + concept + "\"", fontsize=30, alpha=1.0)
ax.set_ylabel("Hours", fontsize=30),
ax.set_xlabel("Concept \"" + concept + "\"", fontsize=30, alpha=0.0),
ax.set_ylim(0, 9000)
ax.set_yticks(range(0, 9000, 1000))
ax.set_yticklabels(labels=range(0, 9000, 1000), rotation=0,
minor=False, fontsize=28)
ax.set_xticklabels(labels=['2012', '2013', '2014'], rotation=0,
minor=False, fontsize=28)
handles, labels = ax.get_legend_handles_labels()
ax.legend(['Market A', 'Market B',
'Market C', 'Market D'],
loc='upper right', fontsize=28)
ax_position += 1
# look "three subplots"
#plt.tight_layout(pad=0.0, w_pad=-8.0, h_pad=0.0)
# look "one plot"
plt.tight_layout(pad=0., w_pad=-16.5, h_pad=0.0)
axes[1].set_ylabel("")
axes[2].set_ylabel("")
axes[1].set_yticklabels("")
axes[2].set_yticklabels("")
axes[0].legend().set_visible(False)
axes[1].legend().set_visible(False)
axes[2].legend(['Market A', 'Market B',
'Market C', 'Market D'],
loc='upper right', fontsize=28)
The dataframe structure of "subset" before grouping looks like this:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 105216 entries, (D_REC, 2012-01-01 00:00:00) to (D_REC, 2014-12-31 23:45:00)
Data columns (total 4 columns):
cmp_tr_neg_p_wrk 105216 non-null float64
exp_tr_pos_p_wrk 105216 non-null float64
cmp_p_spot 105216 non-null float64
exp_p_spot 105216 non-null float64
dtypes: float64(4)
memory usage: 4.0+ MB
and the plot like this:
It is formatted in the "ggplot" style with the following header:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

The answer by #jrjc for use of seaborn is very clever, but it has a few problems, as noted by the author:
The "light" shading is too pale when only two or three categories are needed. It makes colour series (pale blue, blue, dark blue, etc.) difficult to distinguish.
The legend is not produced to distinguish the meaning of the shadings ("pale" means what?)
More importantly, however, I found out that, because of the groupby statement in the code:
This solution works only if the columns are ordered alphabetically. If I rename columns ["I", "J", "K", "L", "M"] by something anti-alphabetical (["zI", "yJ", "xK", "wL", "vM"]), I get this graph instead:
I strove to resolve these problems with the plot_grouped_stackedbars() function in this open-source python module.
It keeps the shading within reasonable range
It auto-generates a legend that explains the shading
It does not rely on groupby
It also allows for
various normalization options (see below normalization to 100% of maximum value)
the addition of error bars
See full demo here. I hope this proves useful and can answer the original question.

Here is a more succinct implementation of the answer from Cord Kaldemeyer. The idea is to reserve as much width as necessary for the plots. Then each cluster gets a subplot of the required length.
# Data and imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator
import matplotlib.gridspec as gridspec
import matplotlib
matplotlib.style.use('ggplot')
np.random.seed(0)
df = pd.DataFrame(np.asarray(1+5*np.random.random((10,4)), dtype=int),columns=["Cluster", "Bar", "Bar_part", "Count"])
df = df.groupby(["Cluster", "Bar", "Bar_part"])["Count"].sum().unstack(fill_value=0)
display(df)
# plotting
clusters = df.index.levels[0]
inter_graph = 0
maxi = np.max(np.sum(df, axis=1))
total_width = len(df)+inter_graph*(len(clusters)-1)
fig = plt.figure(figsize=(total_width,10))
gridspec.GridSpec(1, total_width)
axes=[]
ax_position = 0
for cluster in clusters:
subset = df.loc[cluster]
ax = subset.plot(kind="bar", stacked=True, width=0.8, ax=plt.subplot2grid((1,total_width), (0,ax_position), colspan=len(subset.index)))
axes.append(ax)
ax.set_title(cluster)
ax.set_xlabel("")
ax.set_ylim(0,maxi+1)
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax_position += len(subset.index)+inter_graph
for i in range(1,len(clusters)):
axes[i].set_yticklabels("")
axes[i-1].legend().set_visible(False)
axes[0].set_ylabel("y_label")
fig.suptitle('Big Title', fontsize="x-large")
legend = axes[-1].legend(loc='upper right', fontsize=16, framealpha=1).get_frame()
legend.set_linewidth(3)
legend.set_edgecolor("black")
plt.show()
The result is the following:

We tried to do this just using matplotlib. We converted the values to cumulative values as shown below:
# get cumulative values
cum_val = [a[0]]
for j in range(1,len(a)):
cum_val.append( cum_val[j-1] + a[j] )
We then plotted the bars in descending order of height so that they are all visible. We added some hard-coded color schemes; colors can also be generated sequentially from the RGB cube. The package can be installed with
pip install groupstackbar
Then, it can be imported and used as shown below. Also, there is a function (generate_dummy_data) to generate a dummy_data.csv sample data file in order to test the functionality.
import matplotlib.pyplot as plt
import csv
import random
import groupstackbar
def generate_dummy_data(filename='dummy_data.csv'):
    """Write a random sample dataset for trying out plot_grouped_stacks.

    The file gets the header ``Week,State_SEIR,Age_Cat,Value`` followed by
    one row for every combination of 3 weeks, 4 SEIR states and 5 age
    categories (60 rows). Each value is a random integer in [0, 100).

    filename: path of the csv file to (over)write; defaults to
        'dummy_data.csv' in the current directory.
    """
    weeks = ['Week 1', 'Week 2', 'Week 3']
    states = ['S', 'E', 'I', 'R']
    age_categories = ['Age Cat 1', 'Age Cat 2', 'Age Cat 3', 'Age Cat 4', 'Age Cat 5']
    # newline='' hands line-ending control to the csv module; without it
    # every row is followed by a blank line on Windows.
    with open(filename, 'w', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(['Week', 'State_SEIR', 'Age_Cat', 'Value'])
        for week in weeks:
            for state in states:
                for age_category in age_categories:
                    csvwriter.writerow([week, state, age_category, int(random.random()*100)])
generate_dummy_data()
f = groupstackbar.plot_grouped_stacks('dummy_data.csv', BGV=['State_SEIR','Week','Age_Cat'], extra_space_on_top = 30)
plt.savefig("output.png",dpi=500)
The plot_grouped_stacks() function of groupstackbar is reproduced below:
def plot_grouped_stacks(filename, BGV, fig_size=(10, 8),
                        intra_group_spacing=0.1,
                        inter_group_spacing=10,
                        y_loc_for_group_name=-5,
                        y_loc_for_hstack_name=5,
                        fontcolor_hstacks='blue',
                        fontcolor_groups='black',
                        fontsize_hstacks=20,
                        fontsize_groups=30,
                        x_trim_hstack_label=0,
                        x_trim_group_label=0,
                        extra_space_on_top=20
                        ):
    """Plot a grouped, stacked bar chart from a four-column csv file.

    Arguments:
    filename:
        a csv filename with 4 headers, H1, H2, H3 and H4. Each one of H1/H2/H3/H4 are strings.
        three of the headers should identify a row uniquely
        the remaining header contains the value (must be integer or floating; cannot be a string)
        .csv files without headers will result in the first row being read as headers.
        rows are split on plain commas (quoted fields are not supported); blank lines are skipped.
    duplicates (relevant for csv inputs):
        duplicate entries imply two rows with the same identifier.
        In case of duplicates aggregation is performed before proceeding: the values
        of the duplicates are added together.
    BGV: a python list of three headers in order for stacking (Bars, Groups and Vertical Stacking)
        for example, if BGV=[H2, H1, H3], the group stack plot will be such that:
            maximum number of bars = number of unique values under column H2
            maximum number of bars grouped together horizontally (side-by-side) = number of
                unique values under column H1
            maximum number of vertical stacks in any bar = number of unique values under column H3
        the column not named in BGV is read as the value column.
    fig_size: figure size in inches, passed to plt.figure.
    intra_group_spacing, inter_group_spacing, x_trim_hstack_label, x_trim_group_label:
        percentages (below 100) of the figure width.
    y_loc_for_group_name, y_loc_for_hstack_name:
        percentages (below 100) of the figure height, scaled by empirical factors.
    fontcolor_*/fontsize_*: color and size of the bar ("hstack") and group labels.
    extra_space_on_top: percentage of the tallest bar added as head room above it.

    Returns the matplotlib figure, or None (after printing a message) when an
    argument or the input file is rejected.
    """
    # sanity check for inputs; done before anything is drawn so that a
    # rejected call does not leave an empty figure behind
    percentages = [
        ('intra_group_spacing', intra_group_spacing),
        ('inter_group_spacing', inter_group_spacing),
        ('y_loc_for_group_name', y_loc_for_group_name),
        ('y_loc_for_hstack_name', y_loc_for_hstack_name),
        ('x_trim_hstack_label', x_trim_hstack_label),
        ('x_trim_group_label', x_trim_group_label),
    ]
    for arg_name, arg_value in percentages:
        if arg_value >= 100:
            print("Percentage more than 100 for variable " + arg_name + ", Aborting! ")
            return None

    # read the file: first non-blank line is the header, the rest is data
    fileread_list = []
    with open(filename) as f:
        for line_num, row in enumerate(f, start=1):
            stripped = row.strip()
            if not stripped:
                continue  # tolerate blank lines (e.g. trailing newline)
            r = stripped.split(',')
            if len(r) != 4:
                print('4 items not found # line ', line_num, ' of ', filename)
                return None
            fileread_list.append(r)
    if len(fileread_list) < 2:
        print('No data rows found in ', filename)
        return None
    first_line = fileread_list[0]
    data_rows = fileread_list[1:]  # skipping the first line

    # locate the columns named in BGV; the remaining column holds the values
    bar_variable = BGV[0]
    group_variable = BGV[1]
    vertical_stacking_variable = BGV[2]
    try:
        bar_col = first_line.index(bar_variable)
        group_col = first_line.index(group_variable)
        vstack_col = first_line.index(vertical_stacking_variable)
    except ValueError:
        print('Headers ', BGV[:3], ' not all found in first line of ', filename, ': ', first_line)
        return None
    value_col = next(i for i in range(4)
                     if i not in (bar_col, group_col, vstack_col))

    sorted_order_for_stacking_V = sorted({listed[vstack_col] for listed in data_rows})
    sorted_order_for_stacking_H = sorted({listed[bar_col] for listed in data_rows})
    sorted_order_for_stacking_G = sorted({listed[group_col] for listed in data_rows})

    print (" Vertical/Horizontal/Groups ")
    print (sorted_order_for_stacking_V, " : Vertical stacking labels")
    print (sorted_order_for_stacking_H, " : Horizontal stacking labels")
    print (sorted_order_for_stacking_G, " : Group names")

    # aggregate the values once, keyed by (bar, group, vertical stack);
    # duplicates are summed as documented
    totals = {}
    for listed in data_rows:
        key = (listed[bar_col], listed[group_col], listed[vstack_col])
        totals[key] = totals.get(key, 0) + float(listed[value_col])

    figure_ = plt.figure(figsize=fig_size)
    size = figure_.get_size_inches()
    figure_.add_subplot(1, 1, 1)

    # converting percentages to inches
    intra_group_spacing = intra_group_spacing*size[0]/100
    inter_group_spacing = inter_group_spacing*size[0]/100
    # the multipliers 90 and 70 are set empirically to roughly align the percentage value
    # <this is a quick fix solution, which needs to be improved later>
    y_loc_for_group_name = 90*y_loc_for_group_name*size[1]/100
    y_loc_for_hstack_name = 70*y_loc_for_hstack_name*size[1]/100
    x_trim_hstack_label = x_trim_hstack_label*size[0]/100
    x_trim_group_label = x_trim_group_label*size[0]/100

    # +1 because we need one space before and after as well
    each_group_width = (size[0] - (len(sorted_order_for_stacking_G) + 1) *
                        inter_group_spacing)/len(sorted_order_for_stacking_G)
    # -1 because we need n-1 spaces between bars if there are n bars in each group
    each_bar_width = (each_group_width - (len(sorted_order_for_stacking_H) - 1) *
                      intra_group_spacing)/len(sorted_order_for_stacking_H)

    # colormaps: one sequence of colors per bar label, one level per vertical stack
    number_of_levels_in_each_map = len(sorted_order_for_stacking_V)
    c_map_vertical = {}
    for i, bar_name in enumerate(sorted_order_for_stacking_H):
        try:
            c_map_vertical[bar_name] = sequential_colors[i]
        except (NameError, IndexError, KeyError):
            print ("Something went wrong with hardcoded colors!\n reverting to custom colors (linear in RGB) ")
            c_map_vertical[bar_name] = getColorMaps(N = number_of_levels_in_each_map, type = 'S')

    max_bar_height = 0
    for bar_num, bar_name in enumerate(sorted_order_for_stacking_H):
        for group_num, group_name in enumerate(sorted_order_for_stacking_G):
            # get cumulative values (missing combinations count as 0)
            cum_val = []
            running_total = 0
            for v_name in sorted_order_for_stacking_V:
                running_total += totals.get((bar_name, group_name, v_name), 0)
                cum_val.append(running_total)
            max_bar_height = max(max_bar_height, max(cum_val))

            group_x = group_num*(each_group_width+inter_group_spacing)
            bar_x = group_x + bar_num*(each_bar_width+intra_group_spacing)

            # the group name is the same for every bar of the group: print it once
            if bar_num == 0:
                plt.text(x=group_x - x_trim_group_label,
                         y=y_loc_for_group_name, s=group_name,
                         fontsize=fontsize_groups, color=fontcolor_groups)
            plt.text(x=bar_x - x_trim_hstack_label,
                     y=y_loc_for_hstack_name, s=bar_name,
                     fontsize=fontsize_hstacks, color=fontcolor_hstacks)

            # trick to make them all visible: Plot in descending order of their height!! :)
            for i in range(len(sorted_order_for_stacking_V)-1, -1, -1):
                bar_options = {}
                if group_num == 0:
                    # legend label only in the first group (the others are just repetition)
                    bar_options['label'] = bar_name + "_" + sorted_order_for_stacking_V[i]
                plt.bar(bar_x,
                        height=cum_val[i],
                        width=each_bar_width,
                        color=c_map_vertical[bar_name][i],
                        **bar_options)

    plt.ylim(0, max_bar_height*(1+extra_space_on_top/100))
    plt.tight_layout()
    plt.xticks([], [])
    plt.legend(ncol=len(sorted_order_for_stacking_H))
    return figure_
A pictorial readMe is attached to help the user quickly figure out the parameters to the function. Please feel free to raise an issue or start a pull request. Currently the input format is .csv files with 4 columns, but pandas data frame input can be added if necessary.
https://github.com/jimioke/groupstackbar

You're on the right track! In order to change the order of the bars, you should change the order in the index.
In [5]: df_both = pd.concat(dict(df1 = df1, df2 = df2),axis = 0)
In [6]: df_both
Out[6]:
I J
df1 A 0.423816 0.094405
B 0.825094 0.759266
C 0.654216 0.250606
D 0.676110 0.495251
df2 A 0.607304 0.336233
B 0.581771 0.436421
C 0.233125 0.360291
D 0.519266 0.199637
[8 rows x 2 columns]
So we want to swap axes, then reorder. Here's an easy way to do this
In [7]: df_both.swaplevel(0,1)
Out[7]:
I J
A df1 0.423816 0.094405
B df1 0.825094 0.759266
C df1 0.654216 0.250606
D df1 0.676110 0.495251
A df2 0.607304 0.336233
B df2 0.581771 0.436421
C df2 0.233125 0.360291
D df2 0.519266 0.199637
[8 rows x 2 columns]
In [8]: df_both.swaplevel(0,1).sort_index()
Out[8]:
I J
A df1 0.423816 0.094405
df2 0.607304 0.336233
B df1 0.825094 0.759266
df2 0.581771 0.436421
C df1 0.654216 0.250606
df2 0.233125 0.360291
D df1 0.676110 0.495251
df2 0.519266 0.199637
[8 rows x 2 columns]
If it's important that your horizontal labels show up in the old order (df1,A) rather than (A,df1), we can just swaplevels again and not sort_index:
In [9]: df_both.swaplevel(0,1).sort_index().swaplevel(0,1)
Out[9]:
I J
df1 A 0.423816 0.094405
df2 A 0.607304 0.336233
df1 B 0.825094 0.759266
df2 B 0.581771 0.436421
df1 C 0.654216 0.250606
df2 C 0.233125 0.360291
df1 D 0.676110 0.495251
df2 D 0.519266 0.199637
[8 rows x 2 columns]

Altair can be helpful here. Here is the produced plot.
Imports
import pandas as pd
import numpy as np
from altair import *
Dataset creation
df1=pd.DataFrame(10*np.random.rand(4,2),index=["A","B","C","D"],columns=["I","J"])
df2=pd.DataFrame(10*np.random.rand(4,2),index=["A","B","C","D"],columns=["I","J"])
Preparing dataset
def prep_df(df, name):
    """Convert a wide dataframe to tidy (long) form and label its origin.

    Returns a new frame with one row per cell of ``df`` and the columns
    ``c1`` (index label), ``c2`` (column label), ``values`` (cell value)
    and ``DF`` (the given ``name``, repeated on every row).
    """
    stacked = df.stack()
    tidy = stacked.reset_index()
    tidy.columns = ['c1', 'c2', 'values']
    tidy['DF'] = name
    return tidy
df1 = prep_df(df1, 'DF1')
df2 = prep_df(df2, 'DF2')
df = pd.concat([df1, df2])
Altair plot
Chart(df).mark_bar().encode(y=Y('values', axis=Axis(grid=False)),
x='c2:N',
column=Column('c1:N') ,
color='DF:N').configure_facet_cell( strokeWidth=0.0).configure_cell(width=200, height=200)

Here is how I did it with two charts, including data replication.
Initial Data:
A B C D
0 level1 B1 456 326
1 level1 B3 694 1345
2 level1 B2 546 1471
3 level2 B1 687 806
4 level2 B3 877 1003
5 level2 B2 790 1004
Set multi index
data = data.set_index(["A", "B"])
Here is the code:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
matplotlib.style.use("seaborn-white")
ig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,6))
ax_position = 0
y_offset = -120 # decrease value if you want to decrease the position of data labels
for metric in data.index.get_level_values('A').unique():
idx = pd.IndexSlice
subset = data.loc[idx[[metric], :],
['C', 'D']]
subset = subset.groupby(
subset.index.get_level_values('B')).sum()
ax = subset.plot(kind="bar", stacked=True, colormap="Pastel1",
ax=axes[ax_position])
ax.set_title(metric, fontsize=15, alpha=1.0)
ax.set_xlabel(metric, fontsize=15, alpha=0.0)
ax.set_ylabel("Values", fontsize=15)
ax.set_xticklabels(labels=['B1', "B2", "B3"], rotation=0,
minor=False, fontsize=15)
ax.set_ylim(0, 3000)
ax.set_yticks(range(0, 3000, 500), fontsize=15)
handles, labels = ax.get_legend_handles_labels()
ax_position += 1
for bar in ax.patches:
ax.text(
# Put the text in the middle of each bar. get_x returns the start
# so we add half the width to get to the middle.
bar.get_x() + bar.get_width() / 2,
# Vertically, add the height of the bar to the start of the bar,
# along with the offset.
bar.get_height() + bar.get_y() + y_offset,
# This is actual value we'll show.
round(bar.get_height()),
# Center the labels and style them a bit.
ha='center',
color='w',
weight='bold',
size=12
)
ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
plt.tight_layout(pad=0.0, w_pad=-1.0, h_pad=0.0) # increase w_pad if you'd like to separate charts
axes[1].set_yticklabels("")
axes[1].set_ylabel("")
axes[0].legend().set_visible(False)

You can change the bar order by altering the index order (using sort in this case):
pd.concat([df1, df2], keys=['df1', 'df2']).sort_index(level=1).plot.bar(stacked=True)

Related

Name stacked bars after legend entry on Pandas/Matplotlib

I have a stacked bar chart that works really well for what I'm looking for. My problem is handling the labels.
I can label every single stacked bar after its value (number), but I'm looking to label it after its name (on the legend).
Does anyone have an idea on how to solve this?
ps.: Unfortunately I can't post images yet.
I have something like this:
####
#15#
####
oooo ####
oooo #35#
o55o ####
oooo ####
oooo o12o
And need like this:
####
#### A
####
oooo ####
oooo B #### A
oooo ####
oooo oooo B
I've written a short example, see the code below:
import numpy as np
import matplotlib.pyplot as plt
# Some data
x = np.array([0, 1, 2])
y1 = np.array([3, 4, 1])
y2 = np.array([2, 2, 4])
# label text
label_y1 = 'y1'
label_y2 = 'y2'
# Create the base plot
fig, ax = plt.subplots()
bars_y1 = ax.bar(x, y1, width=0.5, label=label_y1)
bars_y2 = ax.bar(x, y2, width=0.5, label=label_y2, bottom=y1)
# Function to add labels to the plot
def add_labels(ax, bars, label):
    """Annotate every bar in ``bars`` with the text ``label``.

    The text is anchored slightly past the right edge of the bar (its left
    edge plus 1.05 bar widths), vertically centred on the bar, and drawn in
    the bar's own face color.
    """
    text_alignment = dict(va='center', ha='left')
    for rect in bars:
        # Anchor point: just right of the bar, halfway up its height.
        anchor = (
            rect.get_x() + 1.05 * rect.get_width(),
            rect.get_y() + rect.get_height() / 2,
        )
        ax.annotate(label, xy=anchor, color=rect.get_facecolor(), **text_alignment)
# Add the labels in the plot
add_labels(ax, bars_y1, label_y1)
add_labels(ax, bars_y2, label_y2)
plt.show()
First of all, I generate some dummy data (x, y1 and y2). Then, I define the desired label text (label_y1 and label_y2) and lastly I make the base bar graph using Axes.bar. Note that I store the return value from the Axes.bar calls, which is a container containing all the bars!
Now, we get to the interesting part. I define a function called add_labels. As an input, it takes the Axes of interest, a container with all the bars and the desired label text. In the function body, I loop over all the bars and determine the desired x and y location for the label text. Using these values, I place label text at those coordinates using the Axes.annotate method. At the end of the script, I simply call the add_labels function with the desired arguments to get the following output:
Is this what you are looking for?
Based on Dex answer I came up with a solution.
Using patches, we can get every single bar from the chart. The bars are ordered column by column (all rows of the first column, then all rows of the second, and so on). So if you have a 4x3 dataframe:
zero um dois
0 a b c
1 d e f
2 g h i
3 j k l
bars.patches will have each column after the other: [a,d,g,j,b,e,h,k,c,f,i,l]
So, every 4 items (rows), it restarts. To do that, we can use the mod function (%) based on the number of rows in the df:
i % len(df.index) == 0 #moves position counter to the next column name
The code ended up like this:
import pandas as pd
import numpy as np
# Some data
x = np.array(['zero', 'um', 'dois'])
y = np.array([[3, 4, 8],[2, 2, 4],[6, 7, 8]])
df = pd.DataFrame(y, columns = x)
print(df)
zero um dois
0 3 4 8
1 2 2 4
2 6 7 8
title = 'Chart Title'
# Create the axes explicitly: it is passed to the plot call and used again
# below to place the annotations.
fig, ax = plt.subplots()
bars = df.plot.bar(ax=ax, stacked=True, title=title, legend=False)
plt.xlabel('x axis label')
n_rows = len(df.index)
for i, bar in enumerate(bars.patches):  # runs through every single bar on the chart
    # The patches are listed column after column, n_rows patches per column,
    # so integer division by the number of rows gives the column position.
    column_name = df.columns[i // n_rows]
    xloc = bar.get_x()
    yloc = bar.get_y() + bar.get_height() / 2
    # Only segments taller than 30 are labelled; with the small sample data
    # above none qualifies, so lower the threshold to see labels there.
    if bar.get_height() > 30:
        ax.annotate(str(column_name), xy=(xloc, yloc), va='center', ha='left')
So, no matter the size of the dataframe, it will plot the column names next to the bars
chart example:
https://i.stack.imgur.com/2iHau.png

Plotting annual mean and standard deviation in different colors for each year

I have data for several years. I have calculated mean and standard deviation for each year. Now I want to plot each row with mean as a scatter plot and fill plot between the standard deviations that is mean plus minus standard deviation in different colors for different years.
After using df_wc.set_index('Date').resample('Y')["Ratio(a/w)"].mean() it returns only the last date of the year (as shown below in the data set) but I want the fill plot for standard deviation to spread for the entire year.
Sample Data set:
Date | Mean | Std_dv
1858-12-31 1.284273 0.403052
1859-12-31 1.235267 0.373283
1860-12-31 1.093308 0.183646
1861-12-31 1.403693 0.400722
That's a very good question that you have asked, and it did not have an easy answer. But if I had understood the problem correctly, you need a fill plot with different colours for each year. The upper bound and lower bound of the plot will be between mean + std and mean - std?
So, I formed a custom time series and this is how I have plotted the values with the upper bound and lower bounds:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection,PatchCollection
from matplotlib.colors import ListedColormap, BoundaryNorm
import pandas as pd
# Toy series: one random value per time step, labelled by its position.
ts = range(10)
num_classes = len(ts)
df = pd.DataFrame(
    data={
        'TOTAL': np.random.rand(len(ts)),
        'Label': list(range(0, num_classes)),
    },
    index=ts,
)
# Upper / lower bounds sit a fixed distance of 2 above / below the value.
df['UB'] = df['TOTAL'] + 2
df['LB'] = df['TOTAL'] - 2
print(df)

colors = ['r', 'g', 'b', 'y', 'purple', 'orange', 'k', 'pink', 'grey', 'violet']
cmap = ListedColormap(colors)
norm = BoundaryNorm(range(num_classes+1), cmap.N)


def _coloured_line(column, linestyle):
    """Return a LineCollection following df[column], one segment per step.

    Each segment joins two consecutive (index, value) points; the segment
    colours are driven by df['Label'] through `cmap` and `norm`.
    """
    pts = np.array([df.index, df[column]]).T.reshape(-1, 1, 2)
    segs = np.concatenate([pts[:-1], pts[1:]], axis=1)
    collection = LineCollection(segs, cmap=cmap, norm=norm, linestyles=linestyle)
    collection.set_array(df['Label'])
    return collection


lc = _coloured_line('TOTAL', 'dashed')
lcUB = _coloured_line('UB', 'solid')
lcLB = _coloured_line('LB', 'solid')

fig1 = plt.figure()
current_axes = plt.gca()
for collection in (lc, lcUB, lcLB):
    current_axes.add_collection(collection)

# Shade the band between the bounds, one colour per unit-wide index interval.
for i, shade in enumerate(colors):
    in_interval = (df.index >= i) & (df.index <= i+1)
    plt.fill_between(df.index, df['UB'], df['LB'], where=in_interval,
                     alpha=0.1, color=shade)

plt.xlim(df.index.min(), df.index.max())
plt.ylim(-3.1, 3.1)
plt.show()
And the result dataframe obtained looks like this:
TOTAL Label UB LB
0 0.681455 0 2.681455 -1.318545
1 0.987058 1 2.987058 -1.012942
2 0.212432 2 2.212432 -1.787568
3 0.252284 3 2.252284 -1.747716
4 0.886021 4 2.886021 -1.113979
5 0.369499 5 2.369499 -1.630501
6 0.765192 6 2.765192 -1.234808
7 0.747923 7 2.747923 -1.252077
8 0.543212 8 2.543212 -1.456788
9 0.793860 9 2.793860 -1.206140
And the plot looks like this:
Let me know if this helps! :)

How to show percentage in Seaborn countplot [duplicate]

I was wondering if it is possible to create a Seaborn count plot, but instead of actual counts on the y-axis, show the relative frequency (percentage) within its group (as specified with the hue parameter).
I sort of fixed this with the following approach, but I can't imagine this is the easiest approach:
# Plot percentage of occupation per income class
grouped = df.groupby(['income'], sort=False)
occupation_counts = grouped['occupation'].value_counts(normalize=True, sort=False)

# Flatten the (income, occupation) -> fraction mapping into one record per
# pair, scaling each fraction to a percentage.
occupation_data = []
for (income, occupation), percentage in dict(occupation_counts).items():
    occupation_data.append({
        'occupation': occupation,
        'income': income,
        'percentage': percentage*100,
    })

df_occupation = pd.DataFrame(occupation_data)
p = sns.barplot(x="occupation", y="percentage", hue="income", data=df_occupation)
_ = plt.setp(p.get_xticklabels(), rotation=90)  # Rotate labels
Result:
I'm using the well known adult data set from the UCI machine learning repository. The pandas dataframe is created like this:
# Read the adult dataset. The file is read with header=None, so the column
# names are supplied explicitly here.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country', 'income',
]
df = pd.read_csv(
    "data/adult.data",
    names=column_names,
    header=None,
    engine='c',
    lineterminator='\n',
    skipinitialspace=True,
    na_values="?",  # "?" entries are read as missing values
)
This question is sort of related, but does not make use of the hue parameter. And in my case I cannot just change the labels on the y-axis, because the height of the bar must depend on the group.
With newer versions of seaborn you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
df = sns.load_dataset('titanic')
df.head()
x, y = 'class', 'survived'

# Relative frequency of each `y` value within every `x` group, in percent.
shares = df.groupby(x)[y].value_counts(normalize=True)
percent_table = shares.mul(100).rename('percent').reset_index()

# One bar per (x, y) combination, bars of the same x grouped side by side.
sns.catplot(data=percent_table, x=x, y='percent', hue=y, kind='bar')
output
Update: Also show percentages on top of barplots
If you also want percentages, you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
df.head()
x, y = 'class', 'survived'

# Relative frequency of each `y` value within every `x` group, in percent.
df1 = (
    df.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

g = sns.catplot(x=x, y='percent', hue=y, kind='bar', data=df1)
g.ax.set_ylim(0, 100)

# Write each bar's height (rounded to 2 decimals) at the bar's top-left corner.
for bar in g.ax.patches:
    label = str(bar.get_height().round(2)) + '%'
    g.ax.text(bar.get_x(), bar.get_height(), label)
I might be confused. The difference between your output and the output of
# Share of every occupation within each income class, as a percentage,
# ordered by occupation.
shares = df.groupby(['income'])['occupation'].value_counts(normalize=True)
occupation_counts = shares.rename('percentage').mul(100).reset_index()
occupation_counts = occupation_counts.sort_values('occupation')

p = sns.barplot(x="occupation", y="percentage", hue="income", data=occupation_counts)
_ = plt.setp(p.get_xticklabels(), rotation=90)  # Rotate labels
is, it seems to me, only the order of the columns.
And you seem to care about that, since you pass sort=False. But then, in your code the order is determined purely by chance (and the order in which the dictionary is iterated even changes from run to run with Python 3.5).
You could do this with sns.histplot by setting the following properties:
stat = 'density' (this will make the y-axis the density rather than count)
common_norm = False (this will normalize each density independently)
See the simple example below:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')

# Density bars per class; common_norm=False normalises each `survived`
# group on its own rather than over the whole data set.
hist_options = dict(
    multiple="dodge",
    stat='density',
    shrink=0.8,
    common_norm=False,
)
ax = sns.histplot(x=df['class'], hue=df['survived'], **hist_options)
You can use the library Dexplot to do counting as well as normalizing over any variable to get relative frequencies.
Pass the count function the name of the variable you would like to count and it will automatically produce a bar plot of the counts of all unique values. Use split to subdivide the counts by another variable. Notice that Dexplot automatically wraps the x-tick labels.
dxp.count('occupation', data=df, split='income')
Use the normalize parameter to normalize the counts over any variable (or combination of variables with a list). You can also use True to normalize over the grand total of counts.
dxp.count('occupation', data=df, split='income', normalize='income')
It boggled my mind that Seaborn doesn't provide anything like this out of the box.
Still, it was pretty easy to tweak the source code to get what you wanted.
The following code, with the function "percentageplot(x, hue, data)" works just like sns.countplot, but norms each bar per group (i.e. divides each green bar's value by the sum of all green bars)
In effect, it turns this (hard to interpret because different N of Apple vs. Android):
sns.countplot
into this (Normed so that bars reflect proportion of total for Apple, vs Android):
Percentageplot
Hope this helps!!
from seaborn.categorical import _CategoricalPlotter, remove_na
import matplotlib as mpl
class _CategoricalStatPlotter(_CategoricalPlotter):
    """Categorical plotter whose estimator also receives a group total.

    Unlike a one-argument estimator, the estimator used here is called as
    ``estimator(data, total)`` so that it can turn a count into a proportion.
    """

    # `nested_width` is read as an attribute (``self.nested_width``) when the
    # bars are drawn, so it has to be a property, not a plain method.
    @property
    def nested_width(self):
        """A float with the width of plot elements when hue nesting is used."""
        return self.width / len(self.hue_names)

    def estimate_statistic(self, estimator, ci, n_boot):
        """Compute one statistic (and optionally a CI) per plotted group.

        Parameters
        ----------
        estimator : callable
            Called as ``estimator(stat_data, total)``. Without hue nesting,
            ``total`` is the number of entries in all of ``self.plot_data``
            concatenated; with hue nesting it is the number of entries, over
            all groups, whose hue equals the current hue level.
        ci : number or None
            When not None, a bootstrap confidence interval is computed for
            every group that has at least two observations.
        n_boot : int
            Forwarded to ``bootstrap`` as the number of resamples.

        The results are stored in ``self.statistic`` and ``self.confint``;
        ``self.value_label`` (if set) is wrapped with the estimator's name.

        NOTE(review): ``bootstrap`` and ``utils`` are not imported alongside
        this class; they are only needed when ``ci`` is not None -- confirm
        they are in scope before requesting confidence intervals.
        """
        if self.hue_names is None:
            statistic = []
            confint = []
        else:
            statistic = [[] for _ in self.plot_data]
            confint = [[] for _ in self.plot_data]

        for i, group_data in enumerate(self.plot_data):

            # Option 1: we have a single layer of grouping
            # --------------------------------------------
            if self.plot_hues is None:

                if self.plot_units is None:
                    stat_data = remove_na(group_data)
                    unit_data = None
                else:
                    unit_data = self.plot_units[i]
                    # Keep only rows where both the value and its unit exist
                    have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
                    stat_data = group_data[have]
                    unit_data = unit_data[have]

                # Estimate a statistic from the vector of data
                if not stat_data.size:
                    statistic.append(np.nan)
                else:
                    statistic.append(estimator(stat_data, len(np.concatenate(self.plot_data))))

                # Get a confidence interval for this estimate
                if ci is not None:

                    if stat_data.size < 2:
                        confint.append([np.nan, np.nan])
                        continue

                    boots = bootstrap(stat_data, func=estimator,
                                      n_boot=n_boot,
                                      units=unit_data)
                    confint.append(utils.ci(boots, ci))

            # Option 2: we are grouping by a hue layer
            # ----------------------------------------
            else:
                for j, hue_level in enumerate(self.hue_names):

                    # An empty group contributes NaN for every hue level
                    if not self.plot_hues[i].size:
                        statistic[i].append(np.nan)
                        if ci is not None:
                            confint[i].append((np.nan, np.nan))
                        continue

                    hue_mask = self.plot_hues[i] == hue_level
                    # Denominator: how often this hue level occurs overall
                    group_total_n = (np.concatenate(self.plot_hues) == hue_level).sum()
                    if self.plot_units is None:
                        stat_data = remove_na(group_data[hue_mask])
                        unit_data = None
                    else:
                        group_units = self.plot_units[i]
                        have = pd.notnull(
                            np.c_[group_data, group_units]
                        ).all(axis=1)
                        stat_data = group_data[hue_mask & have]
                        unit_data = group_units[hue_mask & have]

                    # Estimate a statistic from the vector of data
                    if not stat_data.size:
                        statistic[i].append(np.nan)
                    else:
                        statistic[i].append(estimator(stat_data, group_total_n))

                    # Get a confidence interval for this estimate
                    if ci is not None:

                        if stat_data.size < 2:
                            confint[i].append([np.nan, np.nan])
                            continue

                        boots = bootstrap(stat_data, func=estimator,
                                          n_boot=n_boot,
                                          units=unit_data)
                        confint[i].append(utils.ci(boots, ci))

        # Save the resulting values for plotting
        self.statistic = np.array(statistic)
        self.confint = np.array(confint)

        # Rename the value label to reflect the estimation
        if self.value_label is not None:
            self.value_label = "{}({})".format(estimator.__name__,
                                               self.value_label)

    def draw_confints(self, ax, at_group, confint, colors,
                      errwidth=None, capsize=None, **kws):
        """Draw one error bar per position in `at_group` onto `ax`.

        `confint` supplies a (low, high) pair and `colors` a colour for each
        position. `errwidth` sets the line width (default: 1.8 times the
        ``lines.linewidth`` rcParam); when `capsize` is given, caps of that
        total width are drawn at both ends. Bars run along the y axis when
        ``self.orient == "v"`` and along the x axis otherwise.
        """
        if errwidth is not None:
            kws.setdefault("lw", errwidth)
        else:
            kws.setdefault("lw", mpl.rcParams["lines.linewidth"] * 1.8)

        for at, (ci_low, ci_high), color in zip(at_group,
                                                confint,
                                                colors):
            if self.orient == "v":
                ax.plot([at, at], [ci_low, ci_high], color=color, **kws)
                if capsize is not None:
                    ax.plot([at - capsize / 2, at + capsize / 2],
                            [ci_low, ci_low], color=color, **kws)
                    ax.plot([at - capsize / 2, at + capsize / 2],
                            [ci_high, ci_high], color=color, **kws)
            else:
                ax.plot([ci_low, ci_high], [at, at], color=color, **kws)
                if capsize is not None:
                    ax.plot([ci_low, ci_low],
                            [at - capsize / 2, at + capsize / 2],
                            color=color, **kws)
                    ax.plot([ci_high, ci_high],
                            [at - capsize / 2, at + capsize / 2],
                            color=color, **kws)
class _BarPlotter(_CategoricalStatPlotter):
    """Show point estimates and confidence intervals with bars."""

    def __init__(self, x, y, hue, data, order, hue_order,
                 estimator, ci, n_boot, units,
                 orient, color, palette, saturation, errcolor, errwidth=None,
                 capsize=None):
        """Initialize the plotter."""
        # Resolve the data, then the colours, then compute the bar heights.
        self.establish_variables(x, y, hue, data, orient,
                                 order, hue_order, units)
        self.establish_colors(color, palette, saturation)
        self.estimate_statistic(estimator, ci, n_boot)

        # Error-bar styling, consumed when the bars are drawn.
        self.errcolor = errcolor
        self.errwidth = errwidth
        self.capsize = capsize

    def draw_bars(self, ax, kws):
        """Draw the bars onto `ax`."""
        # Vertical orientation uses ax.bar, anything else ax.barh.
        draw = ax.barh
        if self.orient == "v":
            draw = ax.bar
        positions = np.arange(len(self.statistic))

        if self.plot_hues is None:
            # Single layer of grouping: one bar and one error bar per group.
            draw(positions, self.statistic, self.width,
                 color=self.colors, align="center", **kws)
            self.draw_confints(ax,
                               positions,
                               self.confint,
                               [self.errcolor] * len(positions),
                               self.errwidth,
                               self.capsize)
            return

        # Hue nesting: within each group, one narrower bar per hue level,
        # shifted by that level's offset.
        for j, hue_level in enumerate(self.hue_names):
            shifted = positions + self.hue_offsets[j]
            draw(shifted, self.statistic[:, j], self.nested_width,
                 color=self.colors[j], align="center",
                 label=hue_level, **kws)

            # Nothing to draw when no confidence intervals were computed.
            if not self.confint.size:
                continue
            self.draw_confints(ax,
                               shifted,
                               self.confint[:, j],
                               [self.errcolor] * len(shifted),
                               self.errwidth,
                               self.capsize)

    def plot(self, ax, bar_kws):
        """Make the plot."""
        self.draw_bars(ax, bar_kws)
        self.annotate_axes(ax)
        horizontal = self.orient == "h"
        if horizontal:
            ax.invert_yaxis()
def percentageplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None,
                   orient=None, color=None, palette=None, saturation=.75,
                   ax=None, **kwargs):
    """Draw a count-style bar plot whose bar lengths are percentages.

    Exactly one of `x` or `y` must be given: `x` produces vertical bars,
    `y` horizontal ones (this overrides any `orient` passed in). Each bar
    shows the number of observations in it as a percentage of the total
    handed to the estimator by the plotter.

    Parameters
    ----------
    x, y, hue, data, order, hue_order, orient, color, palette, saturation
        Forwarded to ``_BarPlotter``.
    ax : matplotlib Axes, optional
        Axes to draw on; the current Axes are used when omitted.
    **kwargs
        Passed through as the bar keyword arguments of ``plotter.plot``.

    Returns
    -------
    The Axes that were drawn on.

    Raises
    ------
    TypeError
        If both or neither of `x` and `y` are given.
    """
    def estimator(values, total):
        """Return the size of `values` as a percentage of `total`."""
        return (float(len(values)) / total) * 100

    # Confidence intervals are switched off for this plot.
    ci = None
    n_boot = 0
    units = None
    errcolor = None

    # The counted variable is passed as both x and y; only the orientation
    # distinguishes the two cases.
    if x is None and y is not None:
        orient = "h"
        x = y
    elif y is None and x is not None:
        orient = "v"
        y = x
    elif x is not None and y is not None:
        raise TypeError("Cannot pass values for both `x` and `y`")
    else:
        raise TypeError("Must pass values for either `x` or `y`")

    plotter = _BarPlotter(x, y, hue, data, order, hue_order,
                          estimator, ci, n_boot, units,
                          orient, color, palette, saturation,
                          errcolor)
    plotter.value_label = "Percentage"

    if ax is None:
        # pyplot is imported here because the accompanying import lines only
        # bring in `matplotlib as mpl`; relying on a global `plt` would raise
        # NameError when no Axes is supplied.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    plotter.plot(ax, kwargs)
    return ax
You can provide estimators for the height of the bar (along y axis) in a seaborn countplot by using the estimator keyword.
ax = sns.barplot(x="x", y="x", data=df, estimator=lambda x: len(x) / len(df) * 100)
The above code snippet is from https://github.com/mwaskom/seaborn/issues/1027
They have a whole discussion about how to provide percentages in a countplot. This answer is based off the same thread linked above.
In the context of your specific problem, you can probably do something like this:
ax = sb.barplot(x='occupation', y='some_numeric_column', data=raw_data, estimator=lambda x: len(x) / len(raw_data) * 100, hue='income')
ax.set(ylabel="Percent")
The above code worked for me (on a different dataset with different attributes). Note that you need to put in some numeric column for y else, it gives an error: "ValueError: Neither the x nor y variable appears to be numeric."
Building on this answer, using stat="probability" worked best for me.
Taken from sns.histplot documentation on the "stat" parameter:
Aggregate statistic to compute in each bin.
count: show the number of observations in each bin
frequency: show the number of observations divided by the bin width
probability or proportion: normalize such that bar heights sum to 1
percent: normalize such that bar heights sum to 100
density: normalize such that the total area of the histogram equals 1
import seaborn as sns
df = sns.load_dataset('titanic')

# Bars for the two `survived` groups are dodged side by side within each
# class; common_norm=False normalises each group separately.
passenger_class = df['class']
survived = df['survived']
ax = sns.histplot(x=passenger_class, hue=survived, multiple="dodge",
                  stat='probability', shrink=0.5, common_norm=False)

matplotlib scatter with c=date

How to plot a pandas dataframe like the one below with x on the x-axis, the values on the y-axis (one line per row) and the lines colored by date
# One row of measurements per date; the column labels are the x positions.
x = [0.1, 0.2, 0.315, 0.4, 0.63, 1, 1.25, 1.6, 2, 3.15, 4]
values = [
    [0.2, 3.1, 17.4, 28.9, 57.7, 76.9, 82.8, 87.6, 92.4, 98.9, 100.0],
    [0.2, 2.1, 15.5, 26.0, 54.2, 75.6, 82.1, 87.4, 92.4, 98.9, 100.0],
]
# Two consecutive days starting on 2017-07-01, one per row of `values`.
dates = pd.date_range(start='2017-07-01', periods=2, freq='D')
data = pd.DataFrame(values, columns=x)
data['dates'] = dates
edit: sorry for not being precise.
Is there a way to set the colors of the lines according to a columns of Timestamps using data[x].T.plot(kind='line', legend=False).
If this is not possible, how to set "c" in plt.scatter to an array of Timestamps?
edit: the plot should look like this but should have a colorbar instead of a legend
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Create test data with a structure similar to the real data:
# 100 rows of 8 values each, one column per x position.
x_values = np.linspace(1, 10, 8)
dat = np.random.randn(100, 8)
df = pd.DataFrame(data=np.abs(dat), columns=x_values)
# Accumulate along each row, then divide every row by its own maximum.
df = df.cumsum(axis=1)
df = df.divide(df.max(axis=1), axis='index')
# Create a discontinuous date range (days between 2016-07-01 and 2017-03-01
# are dropped) and attach 100 sorted, randomly sampled dates to the data frame.
dates = pd.date_range(start=('2016-01-01'), end=('2017-05-01'), freq='D')
dates = dates[(dates < '2016-07-01') | (dates > '2017-03-01')]
df['date'] = sorted(random.sample(dates.date.tolist(), 100))
# Create a data frame with a continuous daily date range spanning the dates in
# df, and one jet-colormap colour per day.
drange = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
colors = iter(plt.cm.jet(np.linspace(0, 1, drange.shape[0])))
cdf = pd.DataFrame(data=np.array([drange.date, list(colors)]).T, columns=['date', 'colors'])
# Merge the colours into the data (no `on=` is given; 'date' is the only
# column name the two frames share).
data = pd.merge(df, cdf)
# Plot all data row by row, with the colour of each line
# matching its date column.
fig, ax = plt.subplots()
for idx in data.index:
    ax.plot(x_values, data.loc[idx, x_values],
            linestyle='-', alpha=0.75,
            color=data.loc[idx, 'colors'],
            label=data.loc[idx, 'date'])
# Reduce the entries of the legend: keep only every `entries`-th handle/label,
# where `entries` is a tenth of the number of rows.
handles, labels = ax.get_legend_handles_labels()
entries = int(data.shape[0]/10)
handles = handles[::entries]
labels = labels[::entries]
ax.legend(handles, labels)

Grid of histograms according to filtered data

Consider this kind of data file:
data-file.txt
75,15,1,57.5,9.9,5
75,15,1,58.1,10.0,5
75,15,2,37.9,8.3,5
75,15,2,18.2,7.3,5
150,15,1,26.4,8.3,10
150,15,1,31.6,7.9,10
150,15,2,30.6,7.5,10
150,15,2,25.1,7.1,10
Observe that 3rd column values are only 1,2.
I would like to produce 3x2-grid of histograms. The subplots below looks right, but each row should contain 2 histograms from different data set, I mean, I filter the data according to last column.
The important code is ax.hist(X[ (y==grp) & (X[:,2]==1), cols], where the filter occurs.
I want 2 histograms on each row:
the 1st row with (X[:,2]== * ) where * being any value from 3rd column (1 or 2),
the 2nd row with (X[:,2]==1) and
the 3rd row with (X[:,2]==2).
In summary, I expect the 2nd and 3rd rows to show histograms for the filtered data:
3rd column value = 1
75,15,1,57.5,9.9,5
75,15,1,58.1,10.0,5
150,15,1,26.4,8.3,10
150,15,1,31.6,7.9,10
3rd column value = 2
75,15,2,37.9,8.3,5
75,15,2,18.2,7.3,5
150,15,2,30.6,7.5,10
150,15,2,25.1,7.1,10
Code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
from itertools import combinations
# NOTE: this script uses Python 2 syntax (print statement, dict.iteritems).
data_file='data-file.txt'
# NOTE(review): no header= argument is passed, so pandas' default header
# handling applies -- confirm the file really starts with a header line.
df = pd.io.parsers.read_csv(
    filepath_or_buffer=data_file,
    delim_whitespace=False,
    )
M, N = df.shape[0], df.shape[1]
# 1-based column position -> short column label.
feature_dict = {i+1:label for i,label in zip(
    range(N),
    ('L',
     'A',
     'G',
     'P',
     'T',
     'PP',
     ))}
# Rename the columns with the labels, in position order.
df.columns = [l for i,l in sorted(feature_dict.items())]
# X: values of all columns except the last -- presumably the integer keys are
# meant to select columns by position; verify against the pandas version used.
X = df[range(N-1)].values
# y: the 'PP' column, used below to split the rows into groups.
y = df['PP'].values
# 1-based index -> distinct value of y, in sorted order.
label_dict = dict(enumerate(sorted(list(set(y)))))
label_dict = {x+1:y for x,y in label_dict.iteritems()}
num_grupos = len(label_dict.keys())
# Every pair of group keys (and, separately, of group values).
grps_to_hist_list = [[j for j in i] for i in combinations(label_dict.keys(), 2)]
grps_to_hist_list_values = [[j for j in i] for i in combinations(label_dict.values(), 2)]
# Column indices of X to draw histograms for ('P' and 'T' per feature_dict).
cols_to_hist = [3, 4]
for grps_to_hist in grps_to_hist_list:
    grps_str = [ label_dict[grps_to_hist[0]], label_dict[grps_to_hist[1]] ]
    print 'creating histogram for groups %s from data file %s' % (grps_str , data_file)
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18,8))
    # zip stops at the shorter input, so only the first two of the six axes
    # (the top row) receive a histogram here.
    for ax,cols in zip(axes.ravel(), cols_to_hist):
        # set bin sizes: 40 edges spanning the column's full value range
        min_b = math.floor(np.min(X[:,cols]))
        max_b = math.ceil(np.max(X[:,cols]))
        bins = np.linspace(min_b, max_b, 40)
        # plotting the histograms: one per group, restricted to rows whose
        # 3rd column equals 1
        #"""
        for grp,color in zip( grps_str, ('blue', 'red')):
            ax.hist(X[ (y==grp) & (X[:,2]==1), cols],
                    color=color,
                    label='%s' % grp,
                    bins=bins,
                    alpha=0.3,)
        ylims = ax.get_ylim()
        # plot annotation
        leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
        leg.get_frame().set_alpha(0.5)
        # leave some headroom above the tallest bar
        ax.set_ylim([0, max(ylims)+2])
        ax.set_xlabel(feature_dict[cols+1])
        ax.set_title('%s' % str(data_file))
        # hide axis ticks
        ax.tick_params(axis="both", which="both", bottom="off", top="off", labelbottom="on", left="off", right="off", labelleft="on")
        # remove axis spines
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["bottom"].set_visible(False)
        ax.spines["left"].set_visible(False)
        #"""
    fig.tight_layout()
    plt.show()
Here is a screen-shot from the code above with the filter (y==grp) & (X[:,2]==1) (which should be on 2nd row).
My logic is to iterate over rows with corresponding masks of your choice, [(X[:,2]==1) | (X[:,2]==2), X[:,2]==1, X[:,2]==2]. Hopefully this is what you want:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
from itertools import combinations
# Draw, for every pair of 'PP' groups, a 3x2 grid of histograms:
#   row 1: rows whose 3rd column is 1 or 2,
#   row 2: rows whose 3rd column is 1,
#   row 3: rows whose 3rd column is 2;
# the two columns of the grid show the 'P' and 'T' features.
# Written to run unchanged on Python 2 and Python 3.
data_file = 'data-file.txt'
# NOTE(review): no header= argument is passed, so pandas' default header
# handling applies -- confirm the file really starts with a header line
# (otherwise pass header=None so the first data row is not lost).
df = pd.read_csv(data_file)
M, N = df.shape[0], df.shape[1]
# 1-based column position -> short column label.
feature_dict = {i+1: label for i, label in zip(
    range(N),
    ('L',
     'A',
     'G',
     'P',
     'T',
     'PP',
     ))}
df.columns = [l for i, l in sorted(feature_dict.items())]
# All columns except the last, selected explicitly by position (integer keys
# are not valid labels once the columns carry string names).
X = df.iloc[:, :N-1].values
y = df['PP'].values
# 1-based index -> distinct value of y, in sorted order.
label_dict = {pos: label for pos, label in enumerate(sorted(set(y)), start=1)}
num_grupos = len(label_dict)
grps_to_hist_list = [list(pair) for pair in combinations(label_dict.keys(), 2)]
grps_to_hist_list_values = [list(pair) for pair in combinations(label_dict.values(), 2)]
# Column indices of X to draw histograms for ('P' and 'T' per feature_dict).
cols_to_hist = [3, 4]
# One boolean row filter per subplot row; they do not depend on the group
# pair, so they are built once.
row_masks = [(X[:, 2] == 1) | (X[:, 2] == 2), X[:, 2] == 1, X[:, 2] == 2]

for grps_to_hist in grps_to_hist_list:
    grps_str = [label_dict[grps_to_hist[0]], label_dict[grps_to_hist[1]]]
    # Parenthesised single argument: prints the same text on Python 2 and 3.
    print('creating histogram for groups %s from data file %s' % (grps_str, data_file))
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 8))
    for row_ax, row_mask in zip(axes, row_masks):
        for ax, cols in zip(row_ax, cols_to_hist):
            # set bin sizes: 40 edges spanning the column's full value range,
            # so every row of the grid shares the same bins
            min_b = math.floor(np.min(X[:, cols]))
            max_b = math.ceil(np.max(X[:, cols]))
            bins = np.linspace(min_b, max_b, 40)

            # plotting the histograms: one per group, restricted by the row filter
            for grp, color in zip(grps_str, ('blue', 'red')):
                ax.hist(X[(y == grp) & row_mask, cols],
                        color=color,
                        label='%s' % grp,
                        bins=bins,
                        alpha=0.3)
            ylims = ax.get_ylim()

            # plot annotation
            leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
            leg.get_frame().set_alpha(0.5)
            # leave some headroom above the tallest bar
            ax.set_ylim([0, max(ylims)+2])
            ax.set_xlabel(feature_dict[cols+1])
            ax.set_title('%s' % str(data_file))

            # hide axis ticks but keep the tick labels; booleans are used
            # because the "on"/"off" strings are not honoured by current
            # matplotlib releases
            ax.tick_params(axis="both", which="both", bottom=False, top=False,
                           labelbottom=True, left=False, right=False,
                           labelleft=True)

            # remove axis spines
            for side in ("top", "right", "bottom", "left"):
                ax.spines[side].set_visible(False)

    fig.tight_layout()
    plt.show()