Plotting data binned in a pandas dataframe in a scatterplot - pandas

I've got a large amount of astronomical data that I need to plot in a scatterplot. I've binned the data according to distance, and I want to plot 4 scatterplots, side by side.
For the purposes of asking this question, I've constructed a MWE based, obviously with different data, on what I've got so far:
import pandas as pd
import matplotlib.pyplot as plt
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky', 'Jim', 'Lee', 'Rob', 'Dave',
'Jane', 'Bronwyn', 'Karen', 'Liz', 'Claire', 'Chris', 'Jan', 'Ruby'],
'Age':[28,34,29,42,14,16,75,68,
27,3,2,19,17,32,71,45],
'Weight':[60,75,73,82,54,55,98,82,45,9,8,47,54,62,67,67]}
stages = ['Toddler', 'Teen', ' Young Adult', 'Adult']
ages = [0,4,20,40,100]
df = pd.DataFrame(data)
df['binned'] = pd.cut(df['Age'], bins=ages, labels=stages)
fig=plt.figure()
fig.subplots_adjust(hspace=0)
fig.subplots_adjust(wspace=0)
gridsize = 1,4
ax1 = plt.subplot2grid(gridsize, (0,0))
ax1.scatter(df['Name'], df['Weight'], alpha = 0.5)
ax1.set_ylabel('Weight, kg', fontsize=20)
ax1.set_xlabel('Name', fontsize=20)
ax2 = plt.subplot2grid(gridsize, (0,1), sharey=ax1, sharex = ax1)
plt.setp(ax2.get_yticklabels(), visible=False)
ax2.scatter(df['Name'], df['Weight'], alpha = 0.5)
ax2.set_xlabel('Name', fontsize=20)
ax3 = plt.subplot2grid(gridsize, (0,2), sharey=ax1, sharex = ax1)
plt.setp(ax3.get_yticklabels(), visible=False)
ax3.scatter(df['Name'], df['Weight'], alpha = 0.5)
ax3.set_xlabel('Name', fontsize=20)
ax4 = plt.subplot2grid(gridsize, (0,3), sharey=ax1, sharex = ax1)
plt.setp(ax4.get_yticklabels(), visible=False)
ax4.scatter(df['Name'], df['Weight'], alpha = 0.5)
ax4.set_xlabel('Name', fontsize=20)
This plots four graphs as expected:
but how do I get each graph to plot only the data from one of each of the bins? In other words, how do I plot just one of the bins?
I'm not worried about the scrunching up of the names on the x axis, those are just for this MWE. They'll be numbers in my actual plots.
Just for clarification, my actual data is binned like
sources['z bins']=pd.cut(sources['z'], [0,1,2,3, max(z)],
labels = ['z < 1', '1 < z < 2', '2 < z < 3', 'z > 3'])

What if you grouped the dataframe by binned and then plotted each group?
For example:
fig=plt.figure()
fig.subplots_adjust(hspace=0)
fig.subplots_adjust(wspace=0)
gridsize = 1,4
for i, (name, frame) in enumerate(df.groupby('binned')):
ax = plt.subplot2grid(gridsize, (0,i))
ax.scatter(frame['Name'], frame['Weight'], alpha = 0.5)
ax.set_xlabel(name, fontsize=20)
I realize you will likely want to clean up the labels a bit, but this at least puts the different bins on a different axes object.
You can iterate over a groupby object and return the name of the group and the dataframe of that group. Here I am using enumerate in order to increment the axes object
Alternatively if you do not want to use a for loop you can access each group with the get_group method of a groupby object.
grouped = df.groupby('binned')
ax1 = plt.subplot2grid(gridsize, (0,0))
ax1.scatter(grouped.get_group('Toddler')['Name'],
grouped.get_group('Toddler')['Weight'],
alpha = 0.5)
ax1.set_ylabel('Weight, kg', fontsize=20)
ax1.set_xlabel('Name', fontsize=20)

Related

Showing Matplotlib pie chart only top 3 item's percentage [duplicate]

I have the following code:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(123456)
import pandas as pd
df = pd.DataFrame(3 * np.random.rand(4, 4), index=['a', 'b', 'c', 'd'],
columns=['x', 'y','z','w'])
plt.style.use('ggplot')
colors = plt.rcParams['axes.color_cycle']
fig, axes = plt.subplots(nrows=2, ncols=3)
for ax in axes.flat:
ax.axis('off')
for ax, col in zip(axes.flat, df.columns):
ax.pie(df[col], labels=df.index, autopct='%.2f', colors=colors)
ax.set(ylabel='', title=col, aspect='equal')
axes[0, 0].legend(bbox_to_anchor=(0, 0.5))
fig.savefig('your_file.png') # Or whichever format you'd like
plt.show()
Which produce the following:
My question is, how can I remove the label based on a condition. For example I'd only want to display labels with percent > 20%. Such that the labels and value of a,c,d won't be displayed in X, etc.
The autopct argument from pie can be a callable, which will receive the current percentage. So you only would need to provide a function that returns an empty string for the values you want to omit the percentage.
Function
def my_autopct(pct):
return ('%.2f' % pct) if pct > 20 else ''
Plot with matplotlib.axes.Axes.pie
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
for ax, col in zip(axes.flat, df.columns):
ax.pie(df[col], labels=df.index, autopct=my_autopct)
ax.set(ylabel='', title=col, aspect='equal')
fig.tight_layout()
Plot directly with the dataframe
axes = df.plot(kind='pie', autopct=my_autopct, figsize=(8, 6), subplots=True, layout=(2, 2), legend=False)
for ax in axes.flat:
yl = ax.get_ylabel()
ax.set(ylabel='', title=yl)
fig = axes[0, 0].get_figure()
fig.tight_layout()
If you need to parametrize the value on the autopct argument, you'll need a function that returns a function, like:
def autopct_generator(limit):
def inner_autopct(pct):
return ('%.2f' % pct) if pct > limit else ''
return inner_autopct
ax.pie(df[col], labels=df.index, autopct=autopct_generator(20), colors=colors)
For the labels, the best thing I can come up with is using list comprehension:
for ax, col in zip(axes.flat, df.columns):
data = df[col]
labels = [n if v > data.sum() * 0.2 else ''
for n, v in zip(df.index, data)]
ax.pie(data, autopct=my_autopct, colors=colors, labels=labels)
Note, however, that the legend by default is being generated from the first passed labels, so you'll need to pass all values explicitly to keep it intact.
axes[0, 0].legend(df.index, bbox_to_anchor=(0, 0.5))
For labels I have used:
def my_level_list(data):
list = []
for i in range(len(data)):
if (data[i]*100/np.sum(data)) > 2 : #2%
list.append('Label '+str(i+1))
else:
list.append('')
return list
patches, texts, autotexts = plt.pie(data, radius = 1, labels=my_level_list(data), autopct=my_autopct, shadow=True)
You can make the labels function a little shorter using list comprehension:
def my_autopct(pct):
return ('%1.1f' % pct) if pct > 1 else ''
def get_new_labels(sizes, labels):
new_labels = [label if size > 1 else '' for size, label in zip(sizes, labels)]
return new_labels
fig, ax = plt.subplots()
_,_,_ = ax.pie(sizes, labels=get_new_labels(sizes, labels), colors=colors, autopct=my_autopct, startangle=90, rotatelabels=False)

Create multiple stacked bar-lots in one figure

The first image is the figure I'm trying to reproduce, and the second image is the data I have. Does anyone have a clean way to do this with pandas or matplotlib?
Just transpose the DataFrame and use df.plot with the stacked flag set to true:
import pandas as pd
from matplotlib import pyplot as plt
df = pd.DataFrame({'squad': [0.6616, 0.1245, 0.0950],
'quac': [0.83, 0.065, 0.0176],
'quoref': [0.504, 0.340364, 0.1067]})
# Transpose
plot_df = df.T
# plot
ax = plot_df.plot(kind='bar', stacked=True, rot='horizontal')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
ax.set_ylabel("% of Questions")
plt.tight_layout()
plt.show()
You can try this:
data = {'squad':[0.661669, 0.127516, 0.095005],
'quac':[0.930514, 0.065951, 0.017680],
'quoref': [0.504963, 0.340364, 0.106700]}
df = pd.DataFrame(data)
bars_1 = df.iloc[0]
bars_2 = df.iloc[1]
bars_3 = df.iloc[2]
# Heights of bars_1 + bars_2
bars_1_to_2 = np.add(bars_1, bars_2).tolist()
# The position of the bars on the x-axis
r = [0, 1, 2]
plt.figure(figsize = (7, 7))
plt.bar(r, bars_1, color = 'lightgrey', edgecolor = 'white')
plt.bar(r, bars_2, bottom = bars_1, color = 'darkgrey', edgecolor = 'white')
plt.bar(r, bars_3, bottom = bars_1_to_2, color = 'dimgrey', edgecolor = 'white')
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xticks(ticks = r, labels = df.columns)
plt.ylabel('% of Questions')
plt.show()

Is there a way to label each wedge of pie chart in this grid?

I want to have multiple pie charts in a grid.
Each pie chart will have a different number of wedges, values, and labels.
The code below shows multiple labels in one pie chart.
Is there a way to label each wedge of pie-charts in this grid?
import matplotlib.pyplot as plt
import numpy as np
def heatmap_with_circles(data_array,row_labels,column_labels,ax=None, cmap=None, norm=None, cbar_kw={}, cbarlabel="", **kwargs):
for row_index, row in enumerate(row_labels,0):
for column_index, column in enumerate(column_labels,0):
print('row_index: %d column_index: %d' %(row_index,column_index))
if row_index==0 and column_index==0:
colors=['indianred','orange','gray']
values=[10,20,30]
else:
values=[45,20,38]
colors=['pink','violet','green']
wedges, text = plt.pie(values,labels=['0', '2', '3'],labeldistance = 0.25,colors=colors)
print('len(wedges):%d wedges: %s, text: %s' %(len(wedges), wedges, text))
radius = 0.45
[w.set_center((column_index,row_index)) for w in wedges]
[w.set_radius(radius) for w in wedges]
# We want to show all ticks...
ax.set_xticks(np.arange(data_array.shape[1]))
ax.set_yticks(np.arange(data_array.shape[0]))
fontsize=10
ax.set_xticklabels(column_labels, fontsize=fontsize)
ax.set_yticklabels(row_labels, fontsize=fontsize)
#X axis labels at top
ax.tick_params(top=True, bottom=False,labeltop=True, labelbottom=False,pad=5)
plt.setp(ax.get_xticklabels(), rotation=55, ha="left", rotation_mode="anchor")
# We want to show all ticks...
ax.set_xticks(np.arange(data_array.shape[1]+1)-.5, minor=True)
ax.set_yticks(np.arange(data_array.shape[0]+1)-.5, minor=True)
ax.grid(which="minor", color="black", linestyle='-', linewidth=2)
ax.tick_params(which="minor", bottom=False, left=False)
data_array=np.random.rand(3,4)
row_labels=['Row1', 'Row2', 'Row3']
column_labels=['Column1', 'Column2', 'Column3','Column4']
fig, ax = plt.subplots(figsize=(1.9*len(column_labels),1.2*len(row_labels)))
ax.set_aspect(1.0)
ax.set_facecolor('white')
heatmap_with_circles(data_array,row_labels,column_labels, ax=ax)
plt.tight_layout()
plt.show()
After updating heatmap_with_circles
def heatmap_with_circles(data_array,row_labels,column_labels,ax=None, cmap=None, norm=None, cbar_kw={}, cbarlabel="", **kwargs):
labels = ['x', 'y', 'z']
for row_index, row in enumerate(row_labels,0):
for column_index, column in enumerate(column_labels,0):
print('row_index: %d column_index: %d' %(row_index,column_index))
if row_index==0 and column_index==0:
colors=['indianred','orange','gray']
values=[10,20,30]
else:
values=[45,20,38]
colors=['pink','violet','green']
# wedges, texts = plt.pie(values,labels=['0', '2', '3'],labeldistance = 0.45,colors=colors)
wedges, texts = plt.pie(values,labeldistance = 0.25,colors=colors)
print('text:%s len(wedges):%d wedges: %s' %(texts, len(wedges), wedges))
radius = 0.45
[w.set_center((column_index,row_index)) for w in wedges]
[w.set_radius(radius) for w in wedges]
[text.set_position((text.get_position()[0]+column_index,text.get_position()[1]+row_index)) for text in texts]
[text.set_text(labels[text_index]) for text_index, text in enumerate(texts,0)]
I got the following image :)
You could loop through the texts of each pie, get its xy position, add column_index and row_index, and set that as new position.
Some small changes to the existing code:
ax.grid(which="minor", ..., clip_on=False) to make sure the thick lines are shown completely, also near the border
ax.set_xlim(xmin=-0.5) to set the limits
import matplotlib.pyplot as plt
import numpy as np
def heatmap_with_circles(data_array, row_labels, column_labels, ax=None):
ax = ax or plt.gca()
for row_index, row in enumerate(row_labels, 0):
for column_index, column in enumerate(column_labels, 0):
colors = np.random.choice(['indianred', 'orange', 'gray', 'pink', 'violet', 'green'], 3, replace=False)
values = np.random.randint(10, 41, 3)
wedges, text = plt.pie(values, labels=['1', '2', '3'], labeldistance=0.25, colors=colors)
radius = 0.45
for w in wedges:
w.set_center((column_index, row_index))
w.set_radius(radius)
w.set_edgecolor('white')
# w.set_linewidth(1)
for t in text:
x, y = t.get_position()
t.set_position((x + column_index, y + row_index))
# We want to show all ticks...
ax.set_xticks(np.arange(data_array.shape[1]))
ax.set_yticks(np.arange(data_array.shape[0]))
fontsize = 10
ax.set_xticklabels(column_labels, fontsize=fontsize)
ax.set_yticklabels(row_labels, fontsize=fontsize)
# X axis labels at top
ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False, pad=5)
plt.setp(ax.get_xticklabels(), rotation=55, ha="left", rotation_mode="anchor")
# We want to show all minor ticks...
ax.set_xticks(np.arange(data_array.shape[1] + 1) - .5, minor=True)
ax.set_yticks(np.arange(data_array.shape[0] + 1) - .5, minor=True)
ax.set_xlim(xmin=-.5)
ax.set_ylim(ymin=-.5)
ax.grid(which="minor", color="black", linestyle='-', linewidth=2, clip_on=False)
ax.tick_params(axis="both", which="both", length=0) # hide tick marks
data_array = np.random.rand(3, 4)
row_labels = ['Row1', 'Row2', 'Row3']
column_labels = ['Column1', 'Column2', 'Column3', 'Column4']
fig, ax = plt.subplots(figsize=(1.9 * len(column_labels), 1.2 * len(row_labels)))
ax.set_aspect(1.0)
ax.set_facecolor('white')
heatmap_with_circles(data_array, row_labels, column_labels, ax=ax)
plt.tight_layout()
plt.show()

matplotlib scatter plot add legend without loop and without using seaborn

I receive the error No handles with labels found to put in legend. when running the code below. How can I add a legend to this scatter plot that shows the color definitions (a red dot for A, blue dot for B, green dot for C)?
### Dummy Dataset
x = [0,1,-1,4,0,2,2,4,2]
y = [1,5,9,2,4,2,5,6,1]
cat = ['A','B','B','B','A','C','A','B','B']
df = pd.DataFrame(list(zip(x,y,cat)), columns =['x', 'y', 'cat'])
### Build color definitions
df.loc[:, 'color'] = df.cat
df.color.replace(['A', 'B', 'C'], ['red', 'blue', 'green'], inplace=True)
display(df)
### Plotting
fig = plt.figure(figsize=(5,5), constrained_layout=True)
gs = fig.add_gridspec(2, 1)
ax1 = fig.add_subplot(gs[0, 0])
ax1.scatter(df.x, df.y, edgecolors = 'none', c = df.color)
ax1.legend(loc='upper left', facecolor='white', frameon=1,
framealpha=1, labelspacing=0.2, borderpad=0.25)
It seems like there might not be a way to do this without a simple loop. Based on the procedure here, the following code works.
x = [0,1,-1,4,0,2,2,4,2]
y = [1,5,9,2,4,2,5,6,1]
cat = ['A','B','B','B','A','C','A','B','B']
df = pd.DataFrame(list(zip(x,y,cat)), columns =['x', 'y', 'cat'])
mycolorsdict = {'A':'red', 'B':'blue', 'C':'green'}
fig = plt.figure(figsize=(5,5), constrained_layout=True)
gs = fig.add_gridspec(2, 1)
ax1 = fig.add_subplot(gs[0, 0])
grouped = df.groupby('cat')
for key, group in grouped:
group.plot(ax=ax1, kind='scatter',
x='x', y='y',
label=key, color=mycolorsdict[key])
ax1.legend(loc='upper left', facecolor='white', frameon=1,
framealpha=1, labelspacing=0.2, borderpad=0.25)

Categorical plot of with data of multiple columns and their mean

I'd like to create a categorical plot of two pandas DataFrame columns a and b in the same figure with shared x and different y axis:
import pandas as pd
import seaborn as sns
example = [
('exp1','f0', 0.25, 2),
('exp1','f1', 0.5, 3),
('exp1','f2', 0.75, 4),
('exp2','f1', -0.25, 1),
('exp2','f2', 1, 2),
('exp2','f3', 0, 3)
]
df = pd.DataFrame(example, columns=['exp', 'split', 'a', 'b'])
mean_df = df.groupby('exp')['a'].mean()
g = sns.catplot(x='exp', y='a', data=df, jitter=False)
ax2 = plt.twinx()
sns.catplot(x='exp', y='b', data=df, jitter=False, ax=ax2)
In this implementation I have the problem that the colors are different for categories (x-values), not for the columns. Can I sole this or do I have to change the data structure?
I would also like to connect the means of the categorical values like in the image like this:
You may want to melt your data first:
data = df.melt(id_vars='exp', value_vars=['a','b'])
fig, ax = plt.subplots()
sns.scatterplot(data=data,
x='exp',
hue='variable',
y='value',
ax=ax)
(data.groupby(['exp','variable'])['value']
.mean()
.unstack('variable')
.plot(ax=ax, legend=False)
)
ax.set_xlim(-0.5, 1.5);
Output:
df = pd.DataFrame(example, columns=['exp', 'split', 'a', 'b'])
mean_df = df.groupby('exp').mean().reset_index()
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
sns.scatterplot(x='exp', y='a', data=df, color='C0', ax=ax1)
sns.scatterplot(x='exp', y='b', data=df, color='C1', ax=ax2)
sns.lineplot(x='exp',y='a', data=mean_df, color='C0', ax=ax1)
sns.lineplot(x='exp',y='b', data=mean_df, color='C1', ax=ax2)