How to show precentage in Seaborn countplot [duplicate] - pandas

I was wondering if it is possible to create a Seaborn count plot, but instead of actual counts on the y-axis, show the relative frequency (percentage) within its group (as specified with the hue parameter).
I sort of fixed this with the following approach, but I can't imagine this is the easiest approach:
# Plot percentage of occupation per income class
grouped = df.groupby(['income'], sort=False)
occupation_counts = grouped['occupation'].value_counts(normalize=True, sort=False)
occupation_data = [
{'occupation': occupation, 'income': income, 'percentage': percentage*100} for
(income, occupation), percentage in dict(occupation_counts).items()
]
df_occupation = pd.DataFrame(occupation_data)
p = sns.barplot(x="occupation", y="percentage", hue="income", data=df_occupation)
_ = plt.setp(p.get_xticklabels(), rotation=90) # Rotate labels
Result:
I'm using the well known adult data set from the UCI machine learning repository. The pandas dataframe is created like this:
# Read the adult dataset
df = pd.read_csv(
"data/adult.data",
engine='c',
lineterminator='\n',
names=['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss', 'hours_per_week',
'native_country', 'income'],
header=None,
skipinitialspace=True,
na_values="?"
)
This question is sort of related, but does not make use of the hue parameter. And in my case I cannot just change the labels on the y-axis, because the height of the bar must depend on the group.

With newer versions of seaborn you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
df = sns.load_dataset('titanic')
df.head()
x,y = 'class', 'survived'
(df
.groupby(x)[y]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x,y='percent',hue=y,kind='bar'))
output
Update: Also show percentages on top of barplots
If you also want percentages, you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
df.head()
x,y = 'class', 'survived'
df1 = df.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
g = sns.catplot(x=x,y='percent',hue=y,kind='bar',data=df1)
g.ax.set_ylim(0,100)
for p in g.ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x()
txt_y = p.get_height()
g.ax.text(txt_x,txt_y,txt)

I might be confused. The difference between your output and the output of
occupation_counts = (df.groupby(['income'])['occupation']
.value_counts(normalize=True)
.rename('percentage')
.mul(100)
.reset_index()
.sort_values('occupation'))
p = sns.barplot(x="occupation", y="percentage", hue="income", data=occupation_counts)
_ = plt.setp(p.get_xticklabels(), rotation=90) # Rotate labels
is, it seems to me, only the order of the columns.
And you seem to care about that, since you pass sort=False. But then, in your code the order is determined uniquely by chance (and the order in which the dictionary is iterated even changes from run to run with Python 3.5).

You could do this with sns.histplot by setting the following properties:
stat = 'density' (this will make the y-axis the density rather than count)
common_norm = False (this will normalize each density independently)
See the simple example below:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
ax = sns.histplot(x = df['class'], hue=df['survived'], multiple="dodge",
stat = 'density', shrink = 0.8, common_norm=False)

You can use the library Dexplot to do counting as well as normalizing over any variable to get relative frequencies.
Pass the count function the name of the variable you would like to count and it will automatically produce a bar plot of the counts of all unique values. Use split to subdivide the counts by another variable. Notice that Dexplot automatically wraps the x-tick labels.
dxp.count('occupation', data=df, split='income')
Use the normalize parameter to normalize the counts over any variable (or combination of variables with a list). You can also use True to normalize over the grand total of counts.
dxp.count('occupation', data=df, split='income', normalize='income')

It boggled my mind that Seaborn doesn't provide anything like this out of the box.
Still, it was pretty easy to tweak the source code to get what you wanted.
The following code, with the function "percentageplot(x, hue, data)" works just like sns.countplot, but norms each bar per group (i.e. divides each green bar's value by the sum of all green bars)
In effect, it turns this (hard to interpret because different N of Apple vs. Android):
sns.countplot
into this (Normed so that bars reflect proportion of total for Apple, vs Android):
Percentageplot
Hope this helps!!
from seaborn.categorical import _CategoricalPlotter, remove_na
import matplotlib as mpl
class _CategoricalStatPlotter(_CategoricalPlotter):
#property
def nested_width(self):
"""A float with the width of plot elements when hue nesting is used."""
return self.width / len(self.hue_names)
def estimate_statistic(self, estimator, ci, n_boot):
if self.hue_names is None:
statistic = []
confint = []
else:
statistic = [[] for _ in self.plot_data]
confint = [[] for _ in self.plot_data]
for i, group_data in enumerate(self.plot_data):
# Option 1: we have a single layer of grouping
# --------------------------------------------
if self.plot_hues is None:
if self.plot_units is None:
stat_data = remove_na(group_data)
unit_data = None
else:
unit_data = self.plot_units[i]
have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
stat_data = group_data[have]
unit_data = unit_data[have]
# Estimate a statistic from the vector of data
if not stat_data.size:
statistic.append(np.nan)
else:
statistic.append(estimator(stat_data, len(np.concatenate(self.plot_data))))
# Get a confidence interval for this estimate
if ci is not None:
if stat_data.size < 2:
confint.append([np.nan, np.nan])
continue
boots = bootstrap(stat_data, func=estimator,
n_boot=n_boot,
units=unit_data)
confint.append(utils.ci(boots, ci))
# Option 2: we are grouping by a hue layer
# ----------------------------------------
else:
for j, hue_level in enumerate(self.hue_names):
if not self.plot_hues[i].size:
statistic[i].append(np.nan)
if ci is not None:
confint[i].append((np.nan, np.nan))
continue
hue_mask = self.plot_hues[i] == hue_level
group_total_n = (np.concatenate(self.plot_hues) == hue_level).sum()
if self.plot_units is None:
stat_data = remove_na(group_data[hue_mask])
unit_data = None
else:
group_units = self.plot_units[i]
have = pd.notnull(
np.c_[group_data, group_units]
).all(axis=1)
stat_data = group_data[hue_mask & have]
unit_data = group_units[hue_mask & have]
# Estimate a statistic from the vector of data
if not stat_data.size:
statistic[i].append(np.nan)
else:
statistic[i].append(estimator(stat_data, group_total_n))
# Get a confidence interval for this estimate
if ci is not None:
if stat_data.size < 2:
confint[i].append([np.nan, np.nan])
continue
boots = bootstrap(stat_data, func=estimator,
n_boot=n_boot,
units=unit_data)
confint[i].append(utils.ci(boots, ci))
# Save the resulting values for plotting
self.statistic = np.array(statistic)
self.confint = np.array(confint)
# Rename the value label to reflect the estimation
if self.value_label is not None:
self.value_label = "{}({})".format(estimator.__name__,
self.value_label)
def draw_confints(self, ax, at_group, confint, colors,
errwidth=None, capsize=None, **kws):
if errwidth is not None:
kws.setdefault("lw", errwidth)
else:
kws.setdefault("lw", mpl.rcParams["lines.linewidth"] * 1.8)
for at, (ci_low, ci_high), color in zip(at_group,
confint,
colors):
if self.orient == "v":
ax.plot([at, at], [ci_low, ci_high], color=color, **kws)
if capsize is not None:
ax.plot([at - capsize / 2, at + capsize / 2],
[ci_low, ci_low], color=color, **kws)
ax.plot([at - capsize / 2, at + capsize / 2],
[ci_high, ci_high], color=color, **kws)
else:
ax.plot([ci_low, ci_high], [at, at], color=color, **kws)
if capsize is not None:
ax.plot([ci_low, ci_low],
[at - capsize / 2, at + capsize / 2],
color=color, **kws)
ax.plot([ci_high, ci_high],
[at - capsize / 2, at + capsize / 2],
color=color, **kws)
class _BarPlotter(_CategoricalStatPlotter):
"""Show point estimates and confidence intervals with bars."""
def __init__(self, x, y, hue, data, order, hue_order,
estimator, ci, n_boot, units,
orient, color, palette, saturation, errcolor, errwidth=None,
capsize=None):
"""Initialize the plotter."""
self.establish_variables(x, y, hue, data, orient,
order, hue_order, units)
self.establish_colors(color, palette, saturation)
self.estimate_statistic(estimator, ci, n_boot)
self.errcolor = errcolor
self.errwidth = errwidth
self.capsize = capsize
def draw_bars(self, ax, kws):
"""Draw the bars onto `ax`."""
# Get the right matplotlib function depending on the orientation
barfunc = ax.bar if self.orient == "v" else ax.barh
barpos = np.arange(len(self.statistic))
if self.plot_hues is None:
# Draw the bars
barfunc(barpos, self.statistic, self.width,
color=self.colors, align="center", **kws)
# Draw the confidence intervals
errcolors = [self.errcolor] * len(barpos)
self.draw_confints(ax,
barpos,
self.confint,
errcolors,
self.errwidth,
self.capsize)
else:
for j, hue_level in enumerate(self.hue_names):
# Draw the bars
offpos = barpos + self.hue_offsets[j]
barfunc(offpos, self.statistic[:, j], self.nested_width,
color=self.colors[j], align="center",
label=hue_level, **kws)
# Draw the confidence intervals
if self.confint.size:
confint = self.confint[:, j]
errcolors = [self.errcolor] * len(offpos)
self.draw_confints(ax,
offpos,
confint,
errcolors,
self.errwidth,
self.capsize)
def plot(self, ax, bar_kws):
"""Make the plot."""
self.draw_bars(ax, bar_kws)
self.annotate_axes(ax)
if self.orient == "h":
ax.invert_yaxis()
def percentageplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None,
orient=None, color=None, palette=None, saturation=.75,
ax=None, **kwargs):
# Estimator calculates required statistic (proportion)
estimator = lambda x, y: (float(len(x))/y)*100
ci = None
n_boot = 0
units = None
errcolor = None
if x is None and y is not None:
orient = "h"
x = y
elif y is None and x is not None:
orient = "v"
y = x
elif x is not None and y is not None:
raise TypeError("Cannot pass values for both `x` and `y`")
else:
raise TypeError("Must pass values for either `x` or `y`")
plotter = _BarPlotter(x, y, hue, data, order, hue_order,
estimator, ci, n_boot, units,
orient, color, palette, saturation,
errcolor)
plotter.value_label = "Percentage"
if ax is None:
ax = plt.gca()
plotter.plot(ax, kwargs)
return ax

You can provide estimators for the height of the bar (along y axis) in a seaborn countplot by using the estimator keyword.
ax = sns.barplot(x="x", y="x", data=df, estimator=lambda x: len(x) / len(df) * 100)
The above code snippet is from https://github.com/mwaskom/seaborn/issues/1027
They have a whole discussion about how to provide percentages in a countplot. This answer is based off the same thread linked above.
In the context of your specific problem, you can probably do something like this:
ax = sb.barplot(x='occupation', y='some_numeric_column', data=raw_data, estimator=lambda x: len(x) / len(raw_data) * 100, hue='income')
ax.set(ylabel="Percent")
The above code worked for me (on a different dataset with different attributes). Note that you need to put in some numeric column for y else, it gives an error: "ValueError: Neither the x nor y variable appears to be numeric."

From this answer, and using "probability" worked best.
Taken from sns.histplot documentation on the "stat" parameter:
Aggregate statistic to compute in each bin.
count: show the number of observations in each bin
frequency: show the number of observations divided by the bin width
probability: or proportion: normalize such that bar heights sum to 1
percent: normalize such that bar heights sum to 100
density: normalize such that the total area of the histogram equals 1
import seaborn as sns
df = sns.load_dataset('titanic')
ax = sns.histplot(
x = df['class'],
hue=df['survived'],
multiple="dodge",
stat = 'probability',
shrink = 0.5,
common_norm=False
)

Related

Matplotlib: different scale on negative side of the axis

Background
I am trying to show three variables on a single plot. I have connected the three points using lines of different colours based on some other variables. This is shown here
Problem
What I want to do is to have a different scale on the negative x-axis. This would help me in providing positive x_ticks, different axis label and also clear and uncluttered representation of the lines on left side of the image
Question
How to have a different positive x-axis starting from 0 towards negative direction?
Have xticks based on data plotted in that direction
Have a separate xlabel for this new axis
Additional information
I have checked other questions regarding inclusion of multiple axes e.g. this and this. However, these questions did not serve the purpose.
Code Used
font_size = 20
plt.rcParams.update({'font.size': font_size})
fig = plt.figure()
ax = fig.add_subplot(111)
#read my_data from file or create it
for case in my_data:
#Iterating over my_data
if condition1 == True:
local_linestyle = '-'
local_color = 'r'
local_line_alpha = 0.6
elif condition2 == 1:
local_linestyle = '-'
local_color = 'b'
local_line_alpha = 0.6
else:
local_linestyle = '--'
local_color = 'g'
local_line_alpha = 0.6
datapoint = [case[0], case[1], case[2]]
plt.plot(datapoint[0], 0, color=local_color)
plt.plot(-datapoint[2], 0, color=local_color)
plt.plot(0, datapoint[1], color=local_color)
plt.plot([datapoint[0], 0], [0, datapoint[1]], linestyle=local_linestyle, color=local_color)
plt.plot([-datapoint[2], 0], [0, datapoint[1]], linestyle=local_linestyle, color=local_color)
plt.show()
exit()
You can define a custom scale, where values below zero are scaled differently than those above zero.
import numpy as np
from matplotlib import scale as mscale
from matplotlib import transforms as mtransforms
from matplotlib.ticker import FuncFormatter
class AsymScale(mscale.ScaleBase):
name = 'asym'
def __init__(self, axis, **kwargs):
mscale.ScaleBase.__init__(self)
self.a = kwargs.get("a", 1)
def get_transform(self):
return self.AsymTrans(self.a)
def set_default_locators_and_formatters(self, axis):
# possibly, set a different locator and formatter here.
fmt = lambda x,pos: "{}".format(np.abs(x))
axis.set_major_formatter(FuncFormatter(fmt))
class AsymTrans(mtransforms.Transform):
input_dims = 1
output_dims = 1
is_separable = True
def __init__(self, a):
mtransforms.Transform.__init__(self)
self.a = a
def transform_non_affine(self, x):
return (x >= 0)*x + (x < 0)*x*self.a
def inverted(self):
return AsymScale.InvertedAsymTrans(self.a)
class InvertedAsymTrans(AsymTrans):
def transform_non_affine(self, x):
return (x >= 0)*x + (x < 0)*x/self.a
def inverted(self):
return AsymScale.AsymTrans(self.a)
Using this you would provide a scale parameter a that scales the negative part of the axes.
# Now that the Scale class has been defined, it must be registered so
# that ``matplotlib`` can find it.
mscale.register_scale(AsymScale)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot([-2, 0, 5], [0,1,0])
ax.set_xscale("asym", a=2)
ax.annotate("negative axis", xy=(.25,0), xytext=(0,-30),
xycoords = "axes fraction", textcoords="offset points", ha="center")
ax.annotate("positive axis", xy=(.75,0), xytext=(0,-30),
xycoords = "axes fraction", textcoords="offset points", ha="center")
plt.show()
The question is not very clear about what xticks and labels are desired, so I left that out for now.
Here's how to get what you want. This solution uses two twined axes object to get different scaling to the left and right of the origin, and then hides all the evidence:
import matplotlib.pyplot as plt
import matplotlib as mpl
from numbers import Number
tickkwargs = {m+k:False for k in ('bottom','top','left','right') for m in ('','label')}
p = np.zeros((10, 3, 2))
p[:,0,0] -= np.arange(10)*.1 + .5
p[:,1,1] += np.repeat(np.arange(5), 2)*.1 + .3
p[:,2,0] += np.arange(10)*.5 + 2
fig = plt.figure(figsize=(8,6))
host = fig.add_subplot(111)
par = host.twiny()
host.set_xlim(-6, 6)
par.set_xlim(-1, 1)
for ps in p:
# mask the points with negative x values
ppos = ps[ps[:,0] >= 0].T
host.plot(*ppos)
# mask the points with positive x values
pneg = ps[ps[:,0] <= 0].T
par.plot(*pneg)
# hide all possible ticks/notation text that could be set by the second x axis
par.tick_params(axis="both", **tickkwargs)
par.xaxis.get_offset_text().set_visible(False)
# fix the x tick labels so they're all positive
host.set_xticklabels(np.abs(host.get_xticks()))
fig.show()
Output:
Here's what the set of points p I used in the code above look like when plotted normally:
fig = plt.figure(figsize=(8,6))
ax = fig.gca()
for ps in p:
ax.plot(*ps.T)
fig.show()
Output:
The method of deriving a class of mscale.ScaleBase as shown in other answers may be too complicated for your purpose.
You can pass two scale transform functions to set_xscale or set_yscale, something like the following.
def get_scale(a=1): # a is the scale of your negative axis
def forward(x):
x = (x >= 0) * x + (x < 0) * x * a
return x
def inverse(x):
x = (x >= 0) * x + (x < 0) * x / a
return x
return forward, inverse
fig, ax = plt.subplots()
forward, inverse = get_scale(a=3)
ax.set_xscale('function', functions=(forward, inverse)) # this is for setting x axis
# do plotting
More examples can be found in this doc.

Separate halves of split violinplot to compare tail data

Is there a way to physically separate the two halves of a "split" seaborn violinplot (or other type of violinplot)? I'm trying to compare two different treatments, but there is a skinny tail, and it's difficult (impossible) to tell whether one or both halves of the split violin go up all the way to the tip of the tail.
One thought I had was that if the two halves were slightly separated instead of right up next to each other, then it would be easy to absorb the data accurately.
Here is my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# load data into a dataframe
df1 = pd.read_excel('Modeling analysis charts.xlsx',
sheetname='lmps',
parse_cols=[0,5],
skiprows=0,
header=1)
# identify which dispatch run this data is from
df1['Run']='Scheduling'
# load data into a dataframe
df2 = pd.read_excel('Modeling analysis charts.xlsx',
sheetname='lmps',
parse_cols=[7,12],
skiprows=0,
header=1)
# identify which dispatch run this data is from
df2['Run']='Pricing'
# drop rows with missing data
df1 = df1.dropna(how='any')
df2 = df2.dropna(how='any')
# merge data from different runs
df = pd.concat([df1,df2])
# LMPs are all opposite of actual values, so correct that
df['LMP'] = -df['LMP']
fontsize = 10
style.use('fivethirtyeight')
fig, axes = plt.subplots()
sns.violinplot(x='Scenario', y='LMP', hue='Run', split=True, data=df, inner=None, scale='area', bw=0.2, cut=0, linewidth=0.5, ax = axes)
axes.set_title('Day Ahead Market')
#axes.set_ylim([-15,90])
axes.yaxis.grid(True)
axes.set_xlabel('Scenario')
axes.set_ylabel('LMP ($/MWh)')
#plt.savefig('DAMarket.pdf', bbox_inches='tight')
plt.show()
EDIT: For historical reasons this is the accepted answer, but have a look at #conchoecia more recent and much cleaner implementation.
Cool idea. The basic idea of my implementation is to draw the whole thing, grab the patches corresponding to the two half-violins, and then shift paths of those patches left or right. Code is hopefully self-explanatory, otherwise let me know in the comments.
import numpy as np
import matplotlib.pyplot as plt;
import matplotlib.collections
import seaborn as sns
import pandas as pd
# create some data
n = 10000 # number of samples
c = 5 # classes
y = np.random.randn(n)
x = np.random.randint(0, c, size=n)
z = np.random.rand(n) > 0.5 # sub-class
data = pd.DataFrame(dict(x=x, y=y, z=z))
# initialise new axis;
# if there is random other crap on the axis (e.g. a previous plot),
# the hacky code below won't work
fig, ax = plt.subplots(1,1)
# plot
inner = None # Note: 'box' is default
ax = sns.violinplot(data=data, x='x', y='y', hue='z', split=True, inner=inner, ax=ax)
# offset stuff
delta = 0.02
for ii, item in enumerate(ax.collections):
# axis contains PolyCollections and PathCollections
if isinstance(item, matplotlib.collections.PolyCollection):
# get path
path, = item.get_paths()
vertices = path.vertices
# shift x-coordinates of path
if not inner:
if ii % 2: # -> to right
vertices[:,0] += delta
else: # -> to left
vertices[:,0] -= delta
else: # inner='box' adds another type of PollyCollection
if ii % 3 == 0:
vertices[:,0] -= delta
elif ii % 3 == 1:
vertices[:,0] += delta
else: # ii % 3 = 2
pass
I expanded on #Paul's answer above and made it more robust. It now supports both vertical and horizontal orientation, and I implemented it to work with inner='sticks' since that fits my application.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.collections
import seaborn as sns
import pandas as pd
def offset_violinplot_halves(ax, delta, width, inner, direction):
"""
This function offsets the halves of a violinplot to compare tails
or to plot something else in between them. This is specifically designed
for violinplots by Seaborn that use the option `split=True`.
For lines, this works on the assumption that Seaborn plots everything with
integers as the center.
Args:
<ax> The axis that contains the violinplots.
<delta> The amount of space to put between the two halves of the violinplot
<width> The total width of the violinplot, as passed to sns.violinplot()
<inner> The type of inner in the seaborn
<direction> Orientation of violinplot. 'hotizontal' or 'vertical'.
Returns:
- NA, modifies the <ax> directly
"""
# offset stuff
if inner == 'sticks':
lines = ax.get_lines()
for line in lines:
if direction == 'horizontal':
data = line.get_ydata()
print(data)
if int(data[0] + 1)/int(data[1] + 1) < 1:
# type is top, move neg, direction backwards for horizontal
data -= delta
else:
# type is bottom, move pos, direction backward for hori
data += delta
line.set_ydata(data)
elif direction == 'vertical':
data = line.get_xdata()
print(data)
if int(data[0] + 1)/int(data[1] + 1) < 1:
# type is left, move neg
data -= delta
else:
# type is left, move pos
data += delta
line.set_xdata(data)
for ii, item in enumerate(ax.collections):
# axis contains PolyCollections and PathCollections
if isinstance(item, matplotlib.collections.PolyCollection):
# get path
path, = item.get_paths()
vertices = path.vertices
half_type = _wedge_dir(vertices, direction)
# shift x-coordinates of path
if half_type in ['top','bottom']:
if inner in ["sticks", None]:
if half_type == 'top': # -> up
vertices[:,1] -= delta
elif half_type == 'bottom': # -> down
vertices[:,1] += delta
elif half_type in ['left', 'right']:
if inner in ["sticks", None]:
if half_type == 'left': # -> left
vertices[:,0] -= delta
elif half_type == 'right': # -> down
vertices[:,0] += delta
def _wedge_dir(vertices, direction):
"""
Args:
<vertices> The vertices from matplotlib.collections.PolyCollection
<direction> Direction must be 'horizontal' or 'vertical' according to how
your plot is laid out.
Returns:
- a string in ['top', 'bottom', 'left', 'right'] that determines where the
half of the violinplot is relative to the center.
"""
if direction == 'horizontal':
result = (direction, len(set(vertices[1:5,1])) == 1)
elif direction == 'vertical':
result = (direction, len(set(vertices[-3:-1,0])) == 1)
outcome_key = {('horizontal', True): 'bottom',
('horizontal', False): 'top',
('vertical', True): 'left',
('vertical', False): 'right'}
# if the first couple x/y values after the start are the same, it
# is the input direction. If not, it is the opposite
return outcome_key[result]
# create some data
n = 100 # number of samples
c = ['cats', 'rats', 'bears', 'pears', 'snares'] # classes
y = np.random.randn(n)
x = np.random.choice(c, size=n)
z = np.random.rand(n) > 0.5 # sub-class
data = pd.DataFrame(dict(x=x, y=y, z=z))
print('done making data')
# initialise new axes;
fig, (ax1, ax2) = plt.subplots(2)
inner = "sticks" # Note: 'box' is default
width = 0.75
delta = 0.05
final_width = width - delta
print(data)
sns.violinplot(data=data, x='y', y='x',
split=True, hue = 'z',
ax = ax1, inner='sticks',
bw = 0.2)
sns.violinplot(data=data, x='x', y='y',
split=True, hue = 'z',
ax = ax2, inner='sticks',
bw = 0.2)
offset_violinplot_halves(ax1, delta, final_width, inner, 'horizontal')
offset_violinplot_halves(ax2, delta, final_width, inner, 'vertical')
plt.show()

Discrete Color Bar with Tick labels in between colors

I am trying to plot some data with a discrete color bar. I was following the example given (https://gist.github.com/jakevdp/91077b0cae40f8f8244a) but the issue is this example does not work 1-1 with different spacing. For example, the spacing in the example in the link is for only increasing by 1 but my data is increasing by 0.5. You can see the output from the code I have.. Any help with this would be appreciated. I know I am missing something key here but cant figure it out.
import matplotlib.pylab as plt
import numpy as np
def discrete_cmap(N, base_cmap=None):
"""Create an N-bin discrete colormap from the specified input map"""
# Note that if base_cmap is a string or None, you can simply do
# return plt.cm.get_cmap(base_cmap, N)
# The following works for string, None, or a colormap instance:
base = plt.cm.get_cmap(base_cmap)
color_list = base(np.linspace(0, 1, N))
cmap_name = base.name + str(N)
return base.from_list(cmap_name, color_list, N)
num=11
x = np.random.randn(40)
y = np.random.randn(40)
c = np.random.randint(num, size=40)
plt.figure(figsize=(10,7.5))
plt.scatter(x, y, c=c, s=50, cmap=discrete_cmap(num, 'jet'))
plt.colorbar(ticks=np.arange(0,5.5,0.5))
plt.clim(-0.5, num - 0.5)
plt.show()
Not sure what version of matplotlib/pyplot introduced this, but plt.get_cmap now supports an int argument specifying the number of colors you want to get, for discrete colormaps.
This automatically results in the colorbar being discrete.
By the way, pandas has an even better handling of the colorbar.
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')
# remove if not using Jupyter/IPython
%matplotlib inline
# choose number of clusters and number of points in each cluster
n_clusters = 5
n_samples = 20
# there are fancier ways to do this
clusters = np.array([k for k in range(n_clusters) for i in range(n_samples)])
# generate the coordinates of the center
# of each cluster by shuffling a range of values
clusters_x = np.arange(n_clusters)
clusters_y = np.arange(n_clusters)
np.random.shuffle(clusters_x)
np.random.shuffle(clusters_y)
# get dicts like cluster -> center coordinate
x_dict = dict(enumerate(clusters_x))
y_dict = dict(enumerate(clusters_y))
# get coordinates of cluster center for each point
x = np.array(list(x_dict[k] for k in clusters)).astype(float)
y = np.array(list(y_dict[k] for k in clusters)).astype(float)
# add noise
x += np.random.normal(scale=0.5, size=n_clusters*n_samples)
y += np.random.normal(scale=0.5, size=n_clusters*n_samples)
### Finally, plot
fig, ax = plt.subplots(figsize=(12,8))
# get discrete colormap
cmap = plt.get_cmap('viridis', n_clusters)
# scatter points
scatter = ax.scatter(x, y, c=clusters, cmap=cmap)
# scatter cluster centers
ax.scatter(clusters_x, clusters_y, c='red')
# add colorbar
cbar = plt.colorbar(scatter)
# set ticks locations (not very elegant, but it works):
# - shift by 0.5
# - scale so that the last value is at the center of the last color
tick_locs = (np.arange(n_clusters) + 0.5)*(n_clusters-1)/n_clusters
cbar.set_ticks(tick_locs)
# set tick labels (as before)
cbar.set_ticklabels(np.arange(n_clusters))
Ok so this is the hack I found for my own question. I am sure there is a better way to do this but this works for what I am doing. Feel free to suggest a better way to do this.
import numpy as np
import matplotlib.pylab as plt
def discrete_cmap(N, base_cmap=None):
"""Create an N-bin discrete colormap from the specified input map"""
# Note that if base_cmap is a string or None, you can simply do
# return plt.cm.get_cmap(base_cmap, N)
# The following works for string, None, or a colormap instance:
base = plt.cm.get_cmap(base_cmap)
color_list = base(np.linspace(0, 1, N))
cmap_name = base.name + str(N)
return base.from_list(cmap_name, color_list, N)
num=11
plt.figure(figsize=(10,7.5))
x = np.random.randn(40)
y = np.random.randn(40)
c = np.random.randint(num, size=40)
plt.scatter(x, y, c=c, s=50, cmap=discrete_cmap(num, 'jet'))
cbar=plt.colorbar(ticks=range(num))
plt.clim(-0.5, num - 0.5)
cbar.ax.set_yticklabels(np.arange(0.0,5.5,0.5))
plt.show()
For some reason I cannot upload the image associated with the code above. I get an error when uploading so not sure how to show the final example. But simply I set the color bar axes for tick labels for a vertical color bar and passed in the labels I want and it produced the correct output.

Matplotlib/pyplot: Auto adjust unit of y Axis

I would like to modify the Y axis unit of the plot indicated below. Preferable would be the use of units like M (Million), k (Thousand) for large numbers. For example, the y Axis should look like: 50k, 100k, 150k, etc.
The plot below is generated by the following code snippet:
plt.autoscale(enable=True, axis='both')
plt.title("TTL Distribution")
plt.xlabel('TTL Value')
plt.ylabel('Number of Packets')
y = graphy # data from a sqlite query
x = graphx # data from a sqlite query
width = 0.5
plt.bar(x, y, width, align='center', linewidth=2, color='red', edgecolor='red')
fig = plt.gcf()
plt.show()
I saw this post and thought I could write my own formatting function:
def y_fmt(x, y):
if max_y > 1000000:
val = int(y)/1000000
return '{:d} M'.format(val)
elif max_y > 1000:
val = int(y) / 1000
return '{:d} k'.format(val)
else:
return y
But I missed that there is no plt.yaxis.set_major_formatter(tick.FuncFormatter(y_fmt)) function available for the bar plot I am using.
How I can achieve a better formatting of the Y axis?
[]
In principle there is always the option to set custom labels via plt.gca().yaxis.set_xticklabels().
However, I'm not sure why there shouldn't be the possibility to use matplotlib.ticker.FuncFormatter here. The FuncFormatter is designed for exactly the purpose of providing custom ticklabels depending on the ticklabel's position and value.
There is actually a nice example in the matplotlib example collection.
In this case we can use the FuncFormatter as desired to provide unit prefixes as suffixes on the axes of a matplotlib plot. To this end, we iterate over the multiples of 1000 and check if the value to be formatted exceeds it. If the value is then a whole number, we can format it as integer with the respective unit symbol as suffix. On the other hand, if there is a remainder behind the decimal point, we check how many decimal places are needed to format this number.
Here is a complete example:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
def y_fmt(y, pos):
decades = [1e9, 1e6, 1e3, 1e0, 1e-3, 1e-6, 1e-9 ]
suffix = ["G", "M", "k", "" , "m" , "u", "n" ]
if y == 0:
return str(0)
for i, d in enumerate(decades):
if np.abs(y) >=d:
val = y/float(d)
signf = len(str(val).split(".")[1])
if signf == 0:
return '{val:d} {suffix}'.format(val=int(val), suffix=suffix[i])
else:
if signf == 1:
print val, signf
if str(val).split(".")[1] == "0":
return '{val:d} {suffix}'.format(val=int(round(val)), suffix=suffix[i])
tx = "{"+"val:.{signf}f".format(signf = signf) +"} {suffix}"
return tx.format(val=val, suffix=suffix[i])
#return y
return y
fig, ax = plt.subplots(ncols=3, figsize=(10,5))
x = np.linspace(0,349,num=350)
y = np.sinc((x-66.)/10.3)**2*1.5e6+np.sinc((x-164.)/8.7)**2*660000.+np.random.rand(len(x))*76000.
width = 1
ax[0].bar(x, y, width, align='center', linewidth=2, color='red', edgecolor='red')
ax[0].yaxis.set_major_formatter(FuncFormatter(y_fmt))
ax[1].bar(x[::-1], y*(-0.8e-9), width, align='center', linewidth=2, color='orange', edgecolor='orange')
ax[1].yaxis.set_major_formatter(FuncFormatter(y_fmt))
ax[2].fill_between(x, np.sin(x/100.)*1.7+100010, np.cos(x/100.)*1.7+100010, linewidth=2, color='#a80975', edgecolor='#a80975')
ax[2].yaxis.set_major_formatter(FuncFormatter(y_fmt))
for axes in ax:
axes.set_title("TTL Distribution")
axes.set_xlabel('TTL Value')
axes.set_ylabel('Number of Packets')
axes.set_xlim([x[0], x[-1]+1])
plt.show()
which provides the following plot:
You were pretty close; one (possibly) confusing thing about FuncFormatter is that the first argument is the tick value, and the second the tick position , which (when named x,y) can be confusing for the y-axis. For clarity, I renamed them in the example below.
The function should take in two inputs (tick value x and position pos) and return a string
(http://matplotlib.org/api/ticker_api.html#matplotlib.ticker.FuncFormatter)
Working example:
import numpy as np
import matplotlib.pylab as pl
import matplotlib.ticker as tick
def y_fmt(tick_val, pos):
if tick_val > 1000000:
val = int(tick_val)/1000000
return '{:d} M'.format(val)
elif tick_val > 1000:
val = int(tick_val) / 1000
return '{:d} k'.format(val)
else:
return tick_val
x = np.arange(300)
y = np.random.randint(0,2000000,x.size)
width = 0.5
pl.bar(x, y, width, align='center', linewidth=2, color='red', edgecolor='red')
pl.xlim(0,300)
ax = pl.gca()
ax.yaxis.set_major_formatter(tick.FuncFormatter(y_fmt))

How to color bars who make up 50% of the data?

I am plotting a histogram for some data points with bar heights being the percentage of that bin from the whole data:
x = normal(size=1000)
hist, bins = np.histogram(x, bins=20)
plt.bar(bins[:-1], hist.astype(np.float32) / hist.sum(), width=(bins[1]-bins[0]), alpha=0.6)
The result is:
I would like all bars that sum up to be 50% of the data to be in a different color, for example:
(I selected the colored bars without actually checking whether their sum adds to 50%)
Any suggestions how to accomplish this?
Here is how you can plot the first half of the bins with a different color, this looks like your mock, but I am not sure it complies to %50 of the data (it is not clear to me what do you mean by that).
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)
fig = plt.figure()
ax = fig.add_subplot(111)
# the histogram of the data
n, bins, patches = ax.hist(x, 50, normed=1, facecolor='green', alpha=0.75)
# now that we found the index we color all the beans smaller than middle index
for p in patches[:len(bins)/2]:
p.set_facecolor('red')
# hist uses np.histogram under the hood to create 'n' and 'bins'.
# np.histogram returns the bin edges, so there will be 50 probability
# density values in n, 51 bin edges in bins and 50 patches. To get
# everything lined up, we'll compute the bin centers
bincenters = 0.5*(bins[1:]+bins[:-1])
# add a 'best fit' line for the normal PDF
y = mlab.normpdf( bincenters, mu, sigma)
l = ax.plot(bincenters, y, 'r--', linewidth=1)
ax.set_xlabel('Smarts')
ax.set_ylabel('Probability')
ax.set_xlim(40, 160)
ax.set_ylim(0, 0.03)
ax.grid(True)
plt.show()
And the output is:
update
The key method you want to look at is patch.set_set_facecolor. You have to understand that almost everything you plot inside the axes object is a Patch, and as such it has this method, here is another example, I arbitrary choose the first 3 bars to have another color, you can choose based on what ever you decide:
import numpy as np
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
## the data
N = 5
menMeans = [18, 35, 30, 35, 27]
## necessary variables
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
## the bars
rects1 = ax.bar(ind, menMeans, width,
color='black',
error_kw=dict(elinewidth=2,ecolor='red'))
for patch in rects1.patches[:3]:
patch.set_facecolor('red')
ax.set_xlim(-width,len(ind)+width)
ax.set_ylim(0,45)
ax.set_ylabel('Scores')
xTickMarks = ['Group'+str(i) for i in range(1,6)]
ax.set_xticks(ind)
xtickNames = ax.set_xticklabels(xTickMarks)
plt.setp(xtickNames, rotation=45, fontsize=10)
plt.show()