Plot line plots of means of grouped boxplots in seaborn - dataframe

def get_stats_array(dataframe, Method, Cases, stat='mean', round_off=1):
mdf = dataframe
pdp = mdf[mdf['smoker'] == Cases]
if stat == 'mean':
means = pdp.groupby('day').mean()
return round(means,round_off)
elif stat == 'median':
medians = pdp.groupby('day').median()
return round(medians,round_off)
elif stat == 'min':
mins = pdp.groupby('day').min()
return mins
elif stat == 'max':
maxs = pdp.groupby('day').max()
return maxs
else:
return None
import seaborn as sns
sns.set_theme(style="ticks", palette="icefire")
tips = sns.load_dataset("tips")
Mean_Yes = get_stats_array(tips, Method=None, Cases='Yes', stat='mean', round_off=1)['total_bill']
Mean_No = get_stats_array(tips, Method=None, Cases='No', stat='mean', round_off=1)['total_bill']
Mean_array_Thu = [Mean_Yes[0] , Mean_No[0]]
Mean_array_Fri = [Mean_Yes[1] , Mean_No[1]]
Mean_array_Sat = [Mean_Yes[2] , Mean_No[2]]
Mean_array_Sun = [Mean_Yes[3] , Mean_No[3]]
CASES = ['Yes','No']
qf1 = pd.DataFrame([Mean_array_Thu], columns=CASES).assign(day='Thur')
qf2 = pd.DataFrame([Mean_array_Fri], columns=CASES).assign(day='Fri')
qf3 = pd.DataFrame([Mean_array_Sat], columns=CASES).assign(day='Sat')
qf4 = pd.DataFrame([Mean_array_Sun], columns=CASES).assign(day='Sun')
pdf = pd.concat([qf1, qf2, qf3, qf4])
pdf = pd.melt(pdf, id_vars=['day'], var_name=['CASES'])
plt.figure(1, figsize = (25,10))
ax = sns.boxplot(x="day", y="total_bill", hue="smoker", palette=["m", "g"], data=tips, showmeans=True)
sns.despine(offset=10, trim=True)
ax = sns.lineplot(x='day', y='value', data=pdf, style='CASES',ax=ax,color='black')
I was trying to plot a grouped box plot. I tried initially with an example available in seaborn. (Code attached) The data-format is as pandas dataFrame. First I plotted the box plot and then I tried to obtain the means of each group using groupby option but that somehow wasn't working. So I created another separate dataframe with the mean values and tried plotting those. I was able to get the separate line plots of the means of each group but the positioning was wrong. I have attached a figure of the resulting plot. I think there might be a better way to do this with groupby option but I am not sure how. But mainly I want the resulting line-plots of the means properly aligned with the means as well as the boxes. Can anyone help me with this? Please let me know if any more information has to be provided
Alignment issue with grouped-boxplot and line plot of means of each group as shown in the image below:

Related

xarray : how to stack several pcolormesh figures above a map?

For a ML project I'm currently on, I need to verify if the trained data are good or not.
Let's say that I'm "splitting" the sky into several altitude grids (let's take 3 values for the moment) and for a given region (let's say, Europe).
One grid could be a signal reception strength (RSSI), another one the signal quality (RSRQ)
Each cell of the grid is therefor a rectangle and it has a mean value of each measurement (i.e. RSSI or RSRQ) performed in that area.
I have hundreds of millions of data
In the code below, I know how to draw a coloured mesh with xarray for each altitude: I just use xr.plot.pcolormesh(lat,lon, the_data_set); that's fine
But this will only give me a "flat" figure like this:
RSSI value at 3 different altitudes
I need to draw all the pcolormesh() of a dataset for each altitude in such way that:
1: I can have the map at the bottom
2: Each pcolormesh() is stacked and "displayed" at its altitude
3: I need to add a 3d scatter plot for testing my trained data
4: Need to be interactive as I have to zoom in areas
For 2 and 3 above, I managed to do something using plt and cartopy :
enter image description here
But plt/cartopy combination is not as interactive as plotly.
But plotly doesn't have the pcolormesh functionality
And still ... I don't know in anycase, how to "stack" the pcolormesh results that I did get above.
I've been digging Internet for few days but I didn't find something that could satisfy all my criteria.
What I did to get my pcolormesh:
import numpy as np
import xarray as xr
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
class super_data():
def __init__(self, lon_bound,lat_bound,alt_bound,x_points,y_points,z_points):
self.lon_bound = lon_bound
self.lat_bound = lat_bound
self.alt_bound = alt_bound
self.x_points = x_points
self.y_points = y_points
self.z_points = z_points
self.lon, self.lat, self.alt = np.meshgrid(np.linspace(self.lon_bound[0], self.lon_bound[1], self.x_points),
np.linspace(self.lat_bound[0], self.lat_bound[1], self.y_points),
np.linspace(self.alt_bound[0], self.alt_bound[1], self.z_points))
self.this_xr = xr.Dataset(
coords={'lat': (('latitude', 'longitude','altitude'), self.lat),
'lon': (('latitude', 'longitude','altitude'), self.lon),
'alt': (('latitude', 'longitude','altitude'), self.alt)})
def add_data_array(self,ds_name,ds_min,ds_max):
def create_temp_data(ds_min,ds_max):
data = np.random.randint(ds_min,ds_max,size=self.y_points * self.x_points)
return data
temp_data = []
# Create "z_points" number of layers in the z axis
for i in range(self.z_points):
temp_data.append(create_temp_data(ds_min,ds_max))
data = np.concatenate(temp_data)
data = data.reshape(self.z_points,self.x_points, self.y_points)
self.this_xr[ds_name] = (("altitude","longitude","latitude"),data)
def plot(self,dataset, extent=None, plot_center=False):
# I want t
if np.sqrt(self.z_points) == np.floor(np.sqrt(self.z_points)):
side_size = int(np.sqrt(self.z_points))
else:
side_size = int(np.floor(np.sqrt(self.z_points) + 1))
fig = plt.figure()
i_ax=1
for i in range(side_size):
for j in range(side_size):
if i_ax < self.z_points+1:
this_dataset = self.this_xr[dataset].sel(altitude=i_ax-1)
# Initialize figure with subplots
ax = fig.add_subplot(side_size, side_size, i_ax, projection=ccrs.PlateCarree())
i_ax += 1
ax.coastlines()
this_dataset.plot.pcolormesh('lon', 'lat', ax=ax, infer_intervals=True, alpha=0.5)
else:
break
plt.tight_layout()
plt.show()
if __name__ == "__main__":
# Wanted coverage :
lons = [-15, 30]
lats = [35, 65]
alts = [1000, 5000]
xarr = super_data(lons,lats,alts,10,8,3)
# Add some fake data
xarr.add_data_array("RSSI",-120,-60)
xarr.add_data_array("pressure",700,1013)
xarr.plot("RSSI",0)
Thanks for you help

Using Python, How to Annotate Seaborn Correlation Heatmap with Significance Levels [duplicate]

I'm trying to do a nice correlation matrix heatmap in python, but I can't find the options to customize it the way I want.
My code is simply this one:
plt.figure(figsize=(16, 6))
mask = np.triu(np.ones_like(Correlazioni.corr(), dtype=np.bool))
heatmap = sns.heatmap(Correlazioni.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);
Now I would like to add (*) in significant cells: ( example: when the coefficient is higher or lower of a certain value)
Thank you very much for the answers, if I missed anything from my request, please let me know and I will provide it.
To show less cells, you can extend the mask, masking away the non-wanted values.
Instead of just setting annot=True, also a list of strings can be provided. You fully control how you format these strings, and e.g. append stars:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
np.random.seed(124)
Correlazioni = pd.DataFrame(np.random.rand(7, 10), columns=[*'abcdefghij'])
plt.figure(figsize=(16, 6))
corr = Correlazioni.corr()
mask = np.triu(np.ones_like(corr, dtype=np.bool))
cut_off = 0.6 # only show cells with abs(correlation) at least this value
extreme_1 = 0.75 # show with a star
extreme_2 = 0.85 # show with a second star
extreme_3 = 0.90 # show with a third star
mask |= np.abs(corr) < cut_off
corr = corr[~mask] # fill in NaN in the non-desired cells
remove_empty_rows_and_cols = True
if remove_empty_rows_and_cols:
wanted_cols = np.flatnonzero(np.count_nonzero(~mask, axis=1))
wanted_rows = np.flatnonzero(np.count_nonzero(~mask, axis=0))
corr = corr.iloc[wanted_cols, wanted_rows]
annot = [[f"{val:.4f}"
+ ('' if abs(val) < extreme_1 else '\n★') # add one star if abs(val) >= extreme_1
+ ('' if abs(val) < extreme_2 else '★') # add an extra star if abs(val) >= extreme_2
+ ('' if abs(val) < extreme_3 else '★') # add yet an extra star if abs(val) >= extreme_3
for val in row] for row in corr.to_numpy()]
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=annot, fmt='', cmap='BrBG')
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize': 18}, pad=16)
plt.show()
Here is how it looks like with the empty rows and columns removed. Note that it doesn't look perfectly triangular anymore.

Series plot - Geopandas

I dont have a working code - but a snipet of my code can be as follows. I'm trying to use geopandas with mathplotlib, and trying to plot a map with links and points.
shape_file = os.path.join(os.getcwd(), "Healthboard")
healthboard = gp.read_file(os.path.join(shape_file, "healthboard.shp"))
healthboard = healthboard.to_crs({'init': 'epsg:4326'}) # re-projection
geo_df1 = geo_df1[geo_df1['HealthBoardArea2019Code'] == string1]
geo = geo_df[geo_df['Healthboard '] == string2]
new_shape_file = os.path.join(os.getcwd(), "Council_Shapefile")
council_to_healtboard = pd.read_csv("council_to_healthboard.csv")
council_to_healthboard = council_to_healtboard.rename(columns = {'CA': 'Council_area_code'})
council = gp.read_file(os.path.join(new_shape_file, "Council_shapefile.shp"))
council = council.to_crs({'init': 'epsg:4326'})
council = council.rename(columns = {'la_s_code':'Council_area_code'})
df = council.merge(council_to_healthboard, on = 'Council_area_code', how ='inner')
# Plotting stuff
fig, ax = plt.subplots(figsize=(15,15))
geo_df1.plot(ax = ax, markersize=35, color = "blue", marker = "*", label = "Postcode Sector")
geo.geometry.plot(ax = ax, color = "red", markersize=20, alpha = 0.8, label = 'SiteName')
#healthboard[healthboard["HBName"]=="Lothian"].plot(ax = ax, alpha = 0.6)
#healthboard[healthboard["HBName"]=="Lothian"].boundary.plot(ax = ax, color = "black", alpha = 0.6)
df[df["HB"]=="S08000024"].boundary.plot(ax =ax, color = "black", alpha = 0.1)
df[df["HB"]=="S08000024"].plot(ax =ax, cmap = "viridis", alpha = 0.1)
links_gp.plot(ax =ax, alpha = 0.25, color='brown', linestyle = "-")
My links_gp.plot has 40 time periods, as a result I want to make one plot, and have a button to adjust the parameters of time. Or if not possible a series of 40 plots. I've tried numerous ways but keep failing on this. I would really appreciate if someone could guide me on this.
I'm aware that you are using matplotlib, but if you don't mind using bokeh instead, you could use the following. To create an interactive plot with a possibility to adjust a parameter, bokeh provides a slider widget which can be used to change the plot based on a custom filter function.
An example from a geopandas dataframe with LineString geometries similar to the one you posted:
import geopandas as gpd
from bokeh.io import show, output_notebook
from bokeh.models import (CDSView, ColumnDataSource, CustomJS,
CustomJSFilter, Slider, Column)
from bokeh.layouts import column
from bokeh.plotting import figure
# prepare data source
links_gp['x'] = links_gp.apply(lambda row: list(row['geometry'].coords.xy[0]), axis=1)
links_gp['y'] = links_gp.apply(lambda row: list(row['geometry'].coords.xy[1]), axis=1)
# drop geometry column, because it can't be serialized to ColumnDataSource
links_gp.drop('geometry', axis=1, inplace=True)
linesource = ColumnDataSource(links_gp)
p = figure(title = 'Bokeh Time Slider',
plot_height = 500,
plot_width = 600,
toolbar_location = 'below',
tools = "pan, wheel_zoom, box_zoom, reset")
slider = Slider(title='Time Period', start=1, end=40, step=1, value=1)
# Callback triggers the filter when the slider moves
callback = CustomJS(args=dict(source=linesource),
code="""source.change.emit();""")
slider.js_on_change('value', callback)
# Custom filter that selects all lines of the time period based on the slider value
custom_filter = CustomJSFilter(args=dict(slider=slider),
code="""
var indices = [];
// iterate through rows of data source and check if time period value equals the slider value
for (var i = 0; i < source.get_length(); i++){
if (source.data['Time Period'][i] == slider.value){
indices.push(true);
} else {
indices.push(false);
}
}
return indices;
""")
# Use filter to determine which lines are visible
view = CDSView(source=linesource, filters=[custom_filter])
# plot lines to map
p.multi_line('x', 'y', source=linesource, color='red', line_width=3, view=view)
layout = column(p, slider)
show(layout)
This will be the result of the above code.

How can I enter input successfully after writing the plt.show() line in a while loop?

As a beginner in Python3, I’ve been following the Python Crash Course book. The following code prints the graph but does not ask for an input as intended. I have tried placing plt.show() after the if statement but the program ends up loading for a long time, eventually failing to display the graph. Is there a way to fix this?
Code:
import matplotlib.pyplot as plt
from random_walk import RandomWalk
while True:
rw = RandomWalk()
rw.fill_walk()
plt.style.use('classic')
fig, ax = plt.subplots()
point_numbers = range(rw.num_points)
ax.scatter(rw.x_values, rw.y_values,c=point_numbers, cmap=plt.cm.Blues, edgecolors='none', s=15)
plt.show()
The program does not print the following for input
keep_running = input("Make another walk? (y/n):")
if keep_running == 'n':
break
The following is saved on a separate file
from random import choice
class RandomWalk:
def __init__(self, num_points=5000):
"""Initialize attributes of a walk"""
self.num_points = num_points
self.x_values = [0]
self.y_values = [0]
def fill_walk(self):
""" calculating all the points in the walk"""
# Keep taking steps until the walk reaches the desired length.
while len(self.x_values) < self.num_points:
# Decide which direction to go and how far to go in that direction.
x_direction = choice([1,-1])
x_distance = choice([0,1,2,3,4])
x_step = x_direction*x_distance
y_direction = choice([1,-1])
y_distance = choice([0,1,2,3,4])
y_step = y_direction*y_distance
# Reject moves that go nowhere
if x_step == 0 and y_step == 0:
continue
# Calculate the new position
x = self.x_values[-1] + x_step
y = self.y_values[-1] + y_step
self.x_values.append(x)
self.y_values.append(y)

How to show precentage in Seaborn countplot [duplicate]

I was wondering if it is possible to create a Seaborn count plot, but instead of actual counts on the y-axis, show the relative frequency (percentage) within its group (as specified with the hue parameter).
I sort of fixed this with the following approach, but I can't imagine this is the easiest approach:
# Plot percentage of occupation per income class
grouped = df.groupby(['income'], sort=False)
occupation_counts = grouped['occupation'].value_counts(normalize=True, sort=False)
occupation_data = [
{'occupation': occupation, 'income': income, 'percentage': percentage*100} for
(income, occupation), percentage in dict(occupation_counts).items()
]
df_occupation = pd.DataFrame(occupation_data)
p = sns.barplot(x="occupation", y="percentage", hue="income", data=df_occupation)
_ = plt.setp(p.get_xticklabels(), rotation=90) # Rotate labels
Result:
I'm using the well known adult data set from the UCI machine learning repository. The pandas dataframe is created like this:
# Read the adult dataset
df = pd.read_csv(
"data/adult.data",
engine='c',
lineterminator='\n',
names=['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss', 'hours_per_week',
'native_country', 'income'],
header=None,
skipinitialspace=True,
na_values="?"
)
This question is sort of related, but does not make use of the hue parameter. And in my case I cannot just change the labels on the y-axis, because the height of the bar must depend on the group.
With newer versions of seaborn you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
df = sns.load_dataset('titanic')
df.head()
x,y = 'class', 'survived'
(df
.groupby(x)[y]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x,y='percent',hue=y,kind='bar'))
output
Update: Also show percentages on top of barplots
If you also want percentages, you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
df.head()
x,y = 'class', 'survived'
df1 = df.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
g = sns.catplot(x=x,y='percent',hue=y,kind='bar',data=df1)
g.ax.set_ylim(0,100)
for p in g.ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x()
txt_y = p.get_height()
g.ax.text(txt_x,txt_y,txt)
I might be confused. The difference between your output and the output of
occupation_counts = (df.groupby(['income'])['occupation']
.value_counts(normalize=True)
.rename('percentage')
.mul(100)
.reset_index()
.sort_values('occupation'))
p = sns.barplot(x="occupation", y="percentage", hue="income", data=occupation_counts)
_ = plt.setp(p.get_xticklabels(), rotation=90) # Rotate labels
is, it seems to me, only the order of the columns.
And you seem to care about that, since you pass sort=False. But then, in your code the order is determined uniquely by chance (and the order in which the dictionary is iterated even changes from run to run with Python 3.5).
You could do this with sns.histplot by setting the following properties:
stat = 'density' (this will make the y-axis the density rather than count)
common_norm = False (this will normalize each density independently)
See the simple example below:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
ax = sns.histplot(x = df['class'], hue=df['survived'], multiple="dodge",
stat = 'density', shrink = 0.8, common_norm=False)
You can use the library Dexplot to do counting as well as normalizing over any variable to get relative frequencies.
Pass the count function the name of the variable you would like to count and it will automatically produce a bar plot of the counts of all unique values. Use split to subdivide the counts by another variable. Notice that Dexplot automatically wraps the x-tick labels.
dxp.count('occupation', data=df, split='income')
Use the normalize parameter to normalize the counts over any variable (or combination of variables with a list). You can also use True to normalize over the grand total of counts.
dxp.count('occupation', data=df, split='income', normalize='income')
It boggled my mind that Seaborn doesn't provide anything like this out of the box.
Still, it was pretty easy to tweak the source code to get what you wanted.
The following code, with the function "percentageplot(x, hue, data)" works just like sns.countplot, but norms each bar per group (i.e. divides each green bar's value by the sum of all green bars)
In effect, it turns this (hard to interpret because different N of Apple vs. Android):
sns.countplot
into this (Normed so that bars reflect proportion of total for Apple, vs Android):
Percentageplot
Hope this helps!!
from seaborn.categorical import _CategoricalPlotter, remove_na
import matplotlib as mpl
class _CategoricalStatPlotter(_CategoricalPlotter):
#property
def nested_width(self):
"""A float with the width of plot elements when hue nesting is used."""
return self.width / len(self.hue_names)
def estimate_statistic(self, estimator, ci, n_boot):
if self.hue_names is None:
statistic = []
confint = []
else:
statistic = [[] for _ in self.plot_data]
confint = [[] for _ in self.plot_data]
for i, group_data in enumerate(self.plot_data):
# Option 1: we have a single layer of grouping
# --------------------------------------------
if self.plot_hues is None:
if self.plot_units is None:
stat_data = remove_na(group_data)
unit_data = None
else:
unit_data = self.plot_units[i]
have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
stat_data = group_data[have]
unit_data = unit_data[have]
# Estimate a statistic from the vector of data
if not stat_data.size:
statistic.append(np.nan)
else:
statistic.append(estimator(stat_data, len(np.concatenate(self.plot_data))))
# Get a confidence interval for this estimate
if ci is not None:
if stat_data.size < 2:
confint.append([np.nan, np.nan])
continue
boots = bootstrap(stat_data, func=estimator,
n_boot=n_boot,
units=unit_data)
confint.append(utils.ci(boots, ci))
# Option 2: we are grouping by a hue layer
# ----------------------------------------
else:
for j, hue_level in enumerate(self.hue_names):
if not self.plot_hues[i].size:
statistic[i].append(np.nan)
if ci is not None:
confint[i].append((np.nan, np.nan))
continue
hue_mask = self.plot_hues[i] == hue_level
group_total_n = (np.concatenate(self.plot_hues) == hue_level).sum()
if self.plot_units is None:
stat_data = remove_na(group_data[hue_mask])
unit_data = None
else:
group_units = self.plot_units[i]
have = pd.notnull(
np.c_[group_data, group_units]
).all(axis=1)
stat_data = group_data[hue_mask & have]
unit_data = group_units[hue_mask & have]
# Estimate a statistic from the vector of data
if not stat_data.size:
statistic[i].append(np.nan)
else:
statistic[i].append(estimator(stat_data, group_total_n))
# Get a confidence interval for this estimate
if ci is not None:
if stat_data.size < 2:
confint[i].append([np.nan, np.nan])
continue
boots = bootstrap(stat_data, func=estimator,
n_boot=n_boot,
units=unit_data)
confint[i].append(utils.ci(boots, ci))
# Save the resulting values for plotting
self.statistic = np.array(statistic)
self.confint = np.array(confint)
# Rename the value label to reflect the estimation
if self.value_label is not None:
self.value_label = "{}({})".format(estimator.__name__,
self.value_label)
def draw_confints(self, ax, at_group, confint, colors,
errwidth=None, capsize=None, **kws):
if errwidth is not None:
kws.setdefault("lw", errwidth)
else:
kws.setdefault("lw", mpl.rcParams["lines.linewidth"] * 1.8)
for at, (ci_low, ci_high), color in zip(at_group,
confint,
colors):
if self.orient == "v":
ax.plot([at, at], [ci_low, ci_high], color=color, **kws)
if capsize is not None:
ax.plot([at - capsize / 2, at + capsize / 2],
[ci_low, ci_low], color=color, **kws)
ax.plot([at - capsize / 2, at + capsize / 2],
[ci_high, ci_high], color=color, **kws)
else:
ax.plot([ci_low, ci_high], [at, at], color=color, **kws)
if capsize is not None:
ax.plot([ci_low, ci_low],
[at - capsize / 2, at + capsize / 2],
color=color, **kws)
ax.plot([ci_high, ci_high],
[at - capsize / 2, at + capsize / 2],
color=color, **kws)
class _BarPlotter(_CategoricalStatPlotter):
"""Show point estimates and confidence intervals with bars."""
def __init__(self, x, y, hue, data, order, hue_order,
estimator, ci, n_boot, units,
orient, color, palette, saturation, errcolor, errwidth=None,
capsize=None):
"""Initialize the plotter."""
self.establish_variables(x, y, hue, data, orient,
order, hue_order, units)
self.establish_colors(color, palette, saturation)
self.estimate_statistic(estimator, ci, n_boot)
self.errcolor = errcolor
self.errwidth = errwidth
self.capsize = capsize
def draw_bars(self, ax, kws):
"""Draw the bars onto `ax`."""
# Get the right matplotlib function depending on the orientation
barfunc = ax.bar if self.orient == "v" else ax.barh
barpos = np.arange(len(self.statistic))
if self.plot_hues is None:
# Draw the bars
barfunc(barpos, self.statistic, self.width,
color=self.colors, align="center", **kws)
# Draw the confidence intervals
errcolors = [self.errcolor] * len(barpos)
self.draw_confints(ax,
barpos,
self.confint,
errcolors,
self.errwidth,
self.capsize)
else:
for j, hue_level in enumerate(self.hue_names):
# Draw the bars
offpos = barpos + self.hue_offsets[j]
barfunc(offpos, self.statistic[:, j], self.nested_width,
color=self.colors[j], align="center",
label=hue_level, **kws)
# Draw the confidence intervals
if self.confint.size:
confint = self.confint[:, j]
errcolors = [self.errcolor] * len(offpos)
self.draw_confints(ax,
offpos,
confint,
errcolors,
self.errwidth,
self.capsize)
def plot(self, ax, bar_kws):
"""Make the plot."""
self.draw_bars(ax, bar_kws)
self.annotate_axes(ax)
if self.orient == "h":
ax.invert_yaxis()
def percentageplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None,
orient=None, color=None, palette=None, saturation=.75,
ax=None, **kwargs):
# Estimator calculates required statistic (proportion)
estimator = lambda x, y: (float(len(x))/y)*100
ci = None
n_boot = 0
units = None
errcolor = None
if x is None and y is not None:
orient = "h"
x = y
elif y is None and x is not None:
orient = "v"
y = x
elif x is not None and y is not None:
raise TypeError("Cannot pass values for both `x` and `y`")
else:
raise TypeError("Must pass values for either `x` or `y`")
plotter = _BarPlotter(x, y, hue, data, order, hue_order,
estimator, ci, n_boot, units,
orient, color, palette, saturation,
errcolor)
plotter.value_label = "Percentage"
if ax is None:
ax = plt.gca()
plotter.plot(ax, kwargs)
return ax
You can provide estimators for the height of the bar (along y axis) in a seaborn countplot by using the estimator keyword.
ax = sns.barplot(x="x", y="x", data=df, estimator=lambda x: len(x) / len(df) * 100)
The above code snippet is from https://github.com/mwaskom/seaborn/issues/1027
They have a whole discussion about how to provide percentages in a countplot. This answer is based off the same thread linked above.
In the context of your specific problem, you can probably do something like this:
ax = sb.barplot(x='occupation', y='some_numeric_column', data=raw_data, estimator=lambda x: len(x) / len(raw_data) * 100, hue='income')
ax.set(ylabel="Percent")
The above code worked for me (on a different dataset with different attributes). Note that you need to put in some numeric column for y else, it gives an error: "ValueError: Neither the x nor y variable appears to be numeric."
From this answer, and using "probability" worked best.
Taken from sns.histplot documentation on the "stat" parameter:
Aggregate statistic to compute in each bin.
count: show the number of observations in each bin
frequency: show the number of observations divided by the bin width
probability: or proportion: normalize such that bar heights sum to 1
percent: normalize such that bar heights sum to 100
density: normalize such that the total area of the histogram equals 1
import seaborn as sns
df = sns.load_dataset('titanic')
ax = sns.histplot(
x = df['class'],
hue=df['survived'],
multiple="dodge",
stat = 'probability',
shrink = 0.5,
common_norm=False
)