I have this simple dataframe:
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
And as I have many columns (all of them numeric), I did this loop in order to do a specific plot:
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.vlines(x=data.mean(),ymin=0, ymax=0.01, linestyles="dotted")
plt.show()
However, I'm having trouble trying to generalize the ymax argument of plt.vlines(), as I need to get the maximum y-axis value of each density plot in order to plot the mean vline of each plot accordingly. I have tried with np.argmax(), but it doesn't seem to work.
Any suggestions?
pandas.DataFrame.plot() returns matplotlib.axes.Axes object. You can use get_ylim() function to get ymin and ymax.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
for i in df.columns:
data = df[i]
ax = data.plot(kind="kde")
ymin, ymax = ax.get_ylim()
plt.vlines(x=data.mean(),ymin=ymin, ymax=ymax, linestyles="dotted")
plt.show()
To get the value of the kde corresponding to the mean, you could extract the curve from the plot and interpolate it at the position of the mean:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": 20 + np.random.randint(-1, 2, size=100).cumsum(),
"Y": 30 + np.random.randint(-1, 2, size=100).cumsum(),
"Z": 40 + np.random.randint(-1, 2, size=100).cumsum()})
fig, ax = plt.subplots()
for col in df.columns:
data = df[col]
data.plot(kind="kde", ax=ax)
x = data.mean()
kdeline = ax.lines[-1]
ymax = np.interp(x, kdeline.get_xdata(), kdeline.get_ydata())
ax.vlines(x=data.mean(), ymin=0, ymax=ymax, linestyles="dotted")
ax.set_ylim(ymin=0) # ax.vlines() moves the bottom ylim; set it back to 0
plt.show()
Use plt.axvline. You specify the limits as numbers in the range [0,1], 0 being the bottom of the plot, 1 being the top.
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.axvline(data.mean(), 0, 1, linestyle='dotted', color='black')
plt.show()
Related
I have data with a lot of values. When plotting a percentage, a lot of values show up in 0%, which are then displayed in a plot. I do not want to include labels which are less than 0% or n%.
This is the code that I use to produce the output
import numpy
import pandas as pd
import matplotlib.pyplot as plt
data = np.random.rand(5,10)
data = 10 + data*10
df = pd.DataFrame(data, columns=list('ABCDEFGHIJ'))
ax = df.plot(kind='bar', stacked=True)
for c in ax.containers:
ax.bar_label(c, fmt='%.0f%%', label_type='center')
ax.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')
I know that I can do what I need using this
ax = df.plot(kind='bar', stacked=True)
for c in ax.containers:
labels = [v if v > 12 else "" for v in c.datavalues]
ax.bar_label(c, labels=labels, label_type="center")
ax.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')
This way I can suppress values less than 12, but how can I limit amount of decimals which will be shown in label like this fmt='%.0f%%' ?
I have a data frame with four columns I would like to plot the normality test for each column in a 2*2 grid, but it only plot one figure, and the else is empty.
import random
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2,2, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
data = {'col1': [random.randrange(1, 50, 1) for i in range(1000)], 'col2': [random.randrange(1, 50, 1) for i in range(1000)],'col3':[random.randrange(1, 50, 1) for i in range(1000)]
,'col4':[random.randrange(1, 50, 1) for i in range(1000)]}
df = pd.DataFrame(data)
for ax, d in zip(axs.ravel(), df):
ax=stats.probplot(df[d], plot=plt)
#ax.set_title(str(d))
plt.show()
is there a way to construct the subplot and the stats.probplot within a loop?
In your code, you need to change the for loop to this:
for ax, d in zip(axs.ravel(), df):
stats.probplot(df[d], plot=ax)
#ax.set_titl(str(d))
plt.show()
I hope this will help you move on.
I struggle hard to succeed in plotting a dot-line between the median values (and min and max) per type of stacked violin distributions.
I tried superposing a violin plot with a seaborn.lineplot but it failed. I'm not sure with this approach that I can draw dot-lines and also link min and max of distributions of the same type. I also tried to use seaborn.lineplot but here the challenge is to plot min and max of the distribution at each x-axis value.
Here is a example dataset and the code for the violin plot in seaborn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
x=[0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8]
cate=['a','a','a','a','b','b','b','b','c','c','c','c','a','a','a','a','b','b','b','b','c','c','c','c','a','a','a','a','b','b','b','b','c','c','c','c','a','a','a','a','b','b','b','b','c','c','c','c']
y=[1.1,1.12,1.13,1.13,3.1,3.12,3.13,3.13,5.1,5.12,5.13,5.13,2.2,2.22,2.25,2.23,4.2,4.22,4.25,4.23,6.2,6.22,6.25,6.23,2.2,2.22,2.24,2.23,4.2,4.22,4.24,4.23,6.2,6.22,6.24,6.23,1.1,1.13,1.14,1.12,3.1,3.13,3.14,3.12,5.1,5.13,5.14,5.12]
my_pal =['red','green', 'purple']
df = pd.DataFrame({'x': x, 'Type': cate, 'y': y})
ax=sns.catplot(y='y', x='x',data=df, hue='Type', palette=my_pal, kind="violin",dodge =False)
sns.lineplot(y='y', x='x',data=df, hue='Type', palette=my_pal, ci=100,legend=False)
plt.show()
but it plots line only on a reduce part of the left of the plot. Is there a trick to superpose lineplot with violin plot?
For the line plot, 'x' is considered numerical. However, for the violin plot 'x' is considered categorical (positioned at 0, 1, 2, ...).
A solution is to convert 'x' to strings to have both plots consider it as categorical.
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
my_pal = ['red', 'green', 'purple']
N = 40
df = pd.DataFrame({'x': np.random.randint(1, 6, N*3) * 0.2,
'y': np.random.uniform(0, 1, N*3) + np.tile([2, 4, 6], N),
'Type': np.tile(list('abc'), N)})
df['x'] = [f'{x:.1f}' for x in df['x']]
ax = sns.violinplot(y='y', x='x', data=df, hue='Type', palette=my_pal, dodge=False)
ax = sns.lineplot(y='y', x='x', data=df, hue='Type', palette=my_pal, ci=100, legend=False, ax=ax)
ax.margins(0.15) # slightly more padding for x and y axis
ax.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.show()
How to plot a histogram with pandas DataFrame.hist() using group by?
I have a data frame with 5 columns: "A", "B", "C", "D" and "Group"
There are two Groups classes: "yes" and "no"
Using:
df.hist()
I get the hist for each of the 4 columns.
Now I would like to get the same 4 graphs but with blue bars (group="yes") and red bars (group = "no").
I tried this withouth success:
df.hist(by = "group")
Using Seaborn
If you are open to use Seaborn, a plot with multiple subplots and multiple variables within each subplot can easily be made using seaborn.FacetGrid.
import numpy as np; np.random.seed(1)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(300,4), columns=list("ABCD"))
df["group"] = np.random.choice(["yes", "no"], p=[0.32,0.68],size=300)
df2 = pd.melt(df, id_vars='group', value_vars=list("ABCD"), value_name='value')
bins=np.linspace(df2.value.min(), df2.value.max(), 10)
g = sns.FacetGrid(df2, col="variable", hue="group", palette="Set1", col_wrap=2)
g.map(plt.hist, 'value', bins=bins, ec="k")
g.axes[-1].legend()
plt.show()
This is not the most flexible workaround but will work for your question specifically.
def sephist(col):
yes = df[df['group'] == 'yes'][col]
no = df[df['group'] == 'no'][col]
return yes, no
for num, alpha in enumerate('abcd'):
plt.subplot(2, 2, num)
plt.hist(sephist(alpha)[0], bins=25, alpha=0.5, label='yes', color='b')
plt.hist(sephist(alpha)[1], bins=25, alpha=0.5, label='no', color='r')
plt.legend(loc='upper right')
plt.title(alpha)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
You could make this more generic by:
adding a df and by parameter to sephist: def sephist(df, by, col)
making the subplots loop more flexible: for num, alpha in enumerate(df.columns)
Because the first argument to matplotlib.pyplot.hist can take
either a single array or a sequency of arrays which are not required
to be of the same length
...an alternattive would be:
for num, alpha in enumerate('abcd'):
plt.subplot(2, 2, num)
plt.hist((sephist(alpha)[0], sephist(alpha)[1]), bins=25, alpha=0.5, label=['yes', 'no'], color=['r', 'b'])
plt.legend(loc='upper right')
plt.title(alpha)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
I generalized one of the other comment's solutions. Hope it helps someone out there. I added a line to ensure binning (number and range) is preserved for each column, regardless of group. The code should work for both "binary" and "categorical" groupings, i.e. "by" can specify a column wherein there are N number of unique groups. Plotting also stops if the number of columns to plot exceeds the subplot space.
import numpy as np
import matplotlib.pyplot as plt
def composite_histplot(df, columns, by, nbins=25, alpha=0.5):
def _sephist(df, col, by):
unique_vals = df[by].unique()
df_by = dict()
for uv in unique_vals:
df_by[uv] = df[df[by] == uv][col]
return df_by
subplt_c = 4
subplt_r = 5
fig = plt.figure()
for num, col in enumerate(columns):
if num + 1 > subplt_c * subplt_r:
continue
plt.subplot(subplt_c, subplt_r, num+1)
bins = np.linspace(df[col].min(), df[col].max(), nbins)
for lbl, sepcol in _sephist(df, col, by).items():
plt.hist(sepcol, bins=bins, alpha=alpha, label=lbl)
plt.legend(loc='upper right', title=by)
plt.title(col)
plt.tight_layout()
return fig
TLDR oneliner;
It won't create the subplots but will create 4 different plots;
[df.groupby('group')[i].plot(kind='hist',title=i)[0] and plt.legend() and plt.show() for i in 'ABCD']
Full working example below
import numpy as np; np.random.seed(1)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(300,4), columns=list("ABCD"))
df["group"] = np.random.choice(["yes", "no"], p=[0.32,0.68],size=300)
[df.groupby('group')[i].plot(kind='hist',title=i)[0] and plt.legend() and plt.show() for i in 'ABCD']
I would like to format my x axis with the legend values at the mid point of each bar whilst retaining the gender group identification. I'd like lower the gender groups to sit below the other xticklabels for clarity.
To this point, I've added xticks but actually labeling them correctly and neatly is proving trickier.
from itertools import chain, cycle
import logging
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
matplotlib.style.use("ggplot")
m = {"Males" : {"Yes": 2, "No": 8}}
w = {"Females": {"Yes": 3, "No": 7}}
data = {**m, **w}
df = DataFrame(data)
# relative freq table
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
plt.show()
The following might be what you're looking for.
from itertools import chain
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame
matplotlib.style.use("ggplot")
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
labels = [l for l in ax.get_xticklabels()]
for i,l in enumerate(labels[len(df_ft):]):
l.set_text(df_ft.columns[i % len(df_ft.columns)])
for i,l in enumerate(labels[:len(df_ft)]):
l.set_text("\n"+l.get_text())
ax.set_xticklabels(labels)
plt.savefig(__file__+".png")
plt.show()
Altair would do a great job here.
from altair import *
from pandas import DataFrame
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df = df.stack().reset_index()
df.columns=['response','gender','count']
Vis #1
Chart(df).mark_bar().encode(x='gender',y='count',color='response').configure_cell(width=200, height=200)
Vis 2
Chart(df).mark_bar().encode(x=X('response', axis=False),
y=Y('count', axis=Axis(grid=False)),
color='response',
column=Column('gender', axis=Axis(axisWidth=1.0, offset=-8.0, orient='bottom'),scale=Scale(padding=30.0))).configure_cell(width=200, height=200).configure_facet_cell(strokeWidth=0)