Show confidence limits and prediction limits in scatter plot - numpy

I have two arrays of data for height and weight:
import numpy as np, matplotlib.pyplot as plt
heights = np.array([50,52,53,54,58,60,62,64,66,67,68,70,72,74,76,55,50,45,65])
weights = np.array([25,50,55,75,80,85,50,65,85,55,45,45,50,75,95,65,50,40,45])
plt.plot(heights,weights,'bo')
plt.show()
How can I produce a plot similar to the following?

Here's what I put together. I tried to closely emulate your screenshot.
Given
import numpy as np
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
# Raw Data
heights = np.array([50,52,53,54,58,60,62,64,66,67,68,70,72,74,76,55,50,45,65])
weights = np.array([25,50,55,75,80,85,50,65,85,55,45,45,50,75,95,65,50,40,45])
Two detailed options to plot confidence intervals:
def plot_ci_manual(t, s_err, n, x, x2, y2, ax=None):
"""Return an axes of confidence bands using a simple approach.
Notes
-----
.. math:: \left| \: \hat{\mu}_{y|x0} - \mu_{y|x0} \: \right| \; \leq \; T_{n-2}^{.975} \; \hat{\sigma} \; \sqrt{\frac{1}{n}+\frac{(x_0-\bar{x})^2}{\sum_{i=1}^n{(x_i-\bar{x})^2}}}
.. math:: \hat{\sigma} = \sqrt{\sum_{i=1}^n{\frac{(y_i-\hat{y})^2}{n-2}}}
References
----------
.. [1] M. Duarte. "Curve fitting," Jupyter Notebook.
http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/CurveFitting.ipynb
"""
if ax is None:
ax = plt.gca()
ci = t * s_err * np.sqrt(1/n + (x2 - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
ax.fill_between(x2, y2 + ci, y2 - ci, color="#b9cfe7", edgecolor="")
return ax
def plot_ci_bootstrap(xs, ys, resid, nboot=500, ax=None):
"""Return an axes of confidence bands using a bootstrap approach.
Notes
-----
The bootstrap approach iteratively resampling residuals.
It plots `nboot` number of straight lines and outlines the shape of a band.
The density of overlapping lines indicates improved confidence.
Returns
-------
ax : axes
- Cluster of lines
- Upper and Lower bounds (high and low) (optional) Note: sensitive to outliers
References
----------
.. [1] J. Stults. "Visualizing Confidence Intervals", Various Consequences.
http://www.variousconsequences.com/2010/02/visualizing-confidence-intervals.html
"""
if ax is None:
ax = plt.gca()
bootindex = sp.random.randint
for _ in range(nboot):
resamp_resid = resid[bootindex(0, len(resid) - 1, len(resid))]
# Make coeffs of for polys
pc = sp.polyfit(xs, ys + resamp_resid, 1)
# Plot bootstrap cluster
ax.plot(xs, sp.polyval(pc, xs), "b-", linewidth=2, alpha=3.0 / float(nboot))
return ax
Code
# Computations ----------------------------------------------------------------
# Modeling with Numpy
def equation(a, b):
"""Return a 1D polynomial."""
return np.polyval(a, b)
x = heights
y = weights
p, cov = np.polyfit(x, y, 1, cov=True) # parameters and covariance from of the fit of 1-D polynom.
y_model = equation(p, x) # model using the fit parameters; NOTE: parameters here are coefficients
# Statistics
n = weights.size # number of observations
m = p.size # number of parameters
dof = n - m # degrees of freedom
t = stats.t.ppf(0.975, n - m) # t-statistic; used for CI and PI bands
# Estimates of Error in Data/Model
resid = y - y_model # residuals; diff. actual data from predicted values
chi2 = np.sum((resid / y_model)**2) # chi-squared; estimates error in data
chi2_red = chi2 / dof # reduced chi-squared; measures goodness of fit
s_err = np.sqrt(np.sum(resid**2) / dof) # standard deviation of the error
# Plotting --------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(8, 6))
# Data
ax.plot(
x, y, "o", color="#b9cfe7", markersize=8,
markeredgewidth=1, markeredgecolor="b", markerfacecolor="None"
)
# Fit
ax.plot(x, y_model, "-", color="0.1", linewidth=1.5, alpha=0.5, label="Fit")
x2 = np.linspace(np.min(x), np.max(x), 100)
y2 = equation(p, x2)
# Confidence Interval (select one)
plot_ci_manual(t, s_err, n, x, x2, y2, ax=ax)
#plot_ci_bootstrap(x, y, resid, ax=ax)
# Prediction Interval
pi = t * s_err * np.sqrt(1 + 1/n + (x2 - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
ax.fill_between(x2, y2 + pi, y2 - pi, color="None", linestyle="--")
ax.plot(x2, y2 - pi, "--", color="0.5", label="95% Prediction Limits")
ax.plot(x2, y2 + pi, "--", color="0.5")
#plt.show()
The following modifications are optional, originally implemented to mimic the OP's desired result.
# Figure Modifications --------------------------------------------------------
# Borders
ax.spines["top"].set_color("0.5")
ax.spines["bottom"].set_color("0.5")
ax.spines["left"].set_color("0.5")
ax.spines["right"].set_color("0.5")
ax.get_xaxis().set_tick_params(direction="out")
ax.get_yaxis().set_tick_params(direction="out")
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
# Labels
plt.title("Fit Plot for Weight", fontsize="14", fontweight="bold")
plt.xlabel("Height")
plt.ylabel("Weight")
plt.xlim(np.min(x) - 1, np.max(x) + 1)
# Custom legend
handles, labels = ax.get_legend_handles_labels()
display = (0, 1)
anyArtist = plt.Line2D((0, 1), (0, 0), color="#b9cfe7") # create custom artists
legend = plt.legend(
[handle for i, handle in enumerate(handles) if i in display] + [anyArtist],
[label for i, label in enumerate(labels) if i in display] + ["95% Confidence Limits"],
loc=9, bbox_to_anchor=(0, -0.21, 1., 0.102), ncol=3, mode="expand"
)
frame = legend.get_frame().set_edgecolor("0.5")
# Save Figure
plt.tight_layout()
plt.savefig("filename.png", bbox_extra_artists=(legend,), bbox_inches="tight")
plt.show()
Output
Using plot_ci_manual():
Using plot_ci_bootstrap():
Hope this helps. Cheers.
Details
I believe that since the legend is outside the figure, it does not show up in matplotblib's popup window. It works fine in Jupyter using %maplotlib inline.
The primary confidence interval code (plot_ci_manual()) is adapted from another source producing a plot similar to the OP. You can select a more advanced technique called residual bootstrapping by uncommenting the second option plot_ci_bootstrap().
Updates
This post has been updated with revised code compatible with Python 3.
stats.t.ppf() accepts the lower tail probability. According to the following resources, t = sp.stats.t.ppf(0.95, n - m) was corrected to t = sp.stats.t.ppf(0.975, n - m) to reflect a two-sided 95% t-statistic (or one-sided 97.5% t-statistic).
original notebook and equation
statistics reference (thanks #Bonlenfum and #tryptofan)
verified t-value given dof=17
y2 was updated to respond more flexibly with a given model (#regeneration).
An abstracted equation function was added to wrap the model function. Non-linear regressions are possible although not demonstrated. Amend appropriate variables as needed (thanks #PJW).
See Also
This post on plotting bands with statsmodels library.
This tutorial on plotting bands and computing confidence intervals with uncertainties library (install with caution in a separate environment).

You can use seaborn plotting library to create plots as you want.
In [18]: import seaborn as sns
In [19]: heights = np.array([50,52,53,54,58,60,62,64,66,67, 68,70,72,74,76,55,50,45,65])
...: weights = np.array([25,50,55,75,80,85,50,65,85,55,45,45,50,75,95,65,50,40,45])
...:
In [20]: sns.regplot(heights,weights, color ='blue')
Out[20]: <matplotlib.axes.AxesSubplot at 0x13644f60>

I need to do this sort of plot occasionally... this was my first time doing it with Python/Jupyter, and this post helps me a lot, especially the detailed Pylang answer.
I know there are 'easier' ways to get there, but I think this way is much more didactic and allows me to learn step by step what's going on. I even learned here that there are 'prediction intervals'! Thanks.
Below is the Pylang code in a more straightforward fashion, including the calculation of Pearson's correlation (and so the r2) and the mean square error (MSE). Of course, the final plot (!) must be adapted for every dataset...
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
heights = np.array([50,52,53,54,58,60,62,64,66,67,68,70,72,74,76,55,50,45,65])
weights = np.array([25,50,55,75,80,85,50,65,85,55,45,45,50,75,95,65,50,40,45])
x = heights
y = weights
slope, intercept = np.polyfit(x, y, 1) # linear model adjustment
y_model = np.polyval([slope, intercept], x) # modeling...
x_mean = np.mean(x)
y_mean = np.mean(y)
n = x.size # number of samples
m = 2 # number of parameters
dof = n - m # degrees of freedom
t = stats.t.ppf(0.975, dof) # Students statistic of interval confidence
residual = y - y_model
std_error = (np.sum(residual**2) / dof)**.5 # Standard deviation of the error
# calculating the r2
# https://www.statisticshowto.com/probability-and-statistics/coefficient-of-determination-r-squared/
# Pearson's correlation coefficient
numerator = np.sum((x - x_mean)*(y - y_mean))
denominator = ( np.sum((x - x_mean)**2) * np.sum((y - y_mean)**2) )**.5
correlation_coef = numerator / denominator
r2 = correlation_coef**2
# mean squared error
MSE = 1/n * np.sum( (y - y_model)**2 )
# to plot the adjusted model
x_line = np.linspace(np.min(x), np.max(x), 100)
y_line = np.polyval([slope, intercept], x_line)
# confidence interval
ci = t * std_error * (1/n + (x_line - x_mean)**2 / np.sum((x - x_mean)**2))**.5
# predicting interval
pi = t * std_error * (1 + 1/n + (x_line - x_mean)**2 / np.sum((x - x_mean)**2))**.5
############### Ploting
plt.rcParams.update({'font.size': 14})
fig = plt.figure()
ax = fig.add_axes([.1, .1, .8, .8])
ax.plot(x, y, 'o', color = 'royalblue')
ax.plot(x_line, y_line, color = 'royalblue')
ax.fill_between(x_line, y_line + pi, y_line - pi, color = 'lightcyan', label = '95% prediction interval')
ax.fill_between(x_line, y_line + ci, y_line - ci, color = 'skyblue', label = '95% confidence interval')
ax.set_xlabel('x')
ax.set_ylabel('y')
# rounding and position must be changed for each case and preference
a = str(np.round(intercept))
b = str(np.round(slope,2))
r2s = str(np.round(r2,2))
MSEs = str(np.round(MSE))
ax.text(45, 110, 'y = ' + a + ' + ' + b + ' x')
ax.text(45, 100, '$r^2$ = ' + r2s + ' MSE = ' + MSEs)
plt.legend(bbox_to_anchor=(1, .25), fontsize=12)

For a project of mine, I needed to create intervals for time-series modeling, and to make the procedure more efficient I created tsmoothie: A python library for time-series smoothing and outlier detection in a vectorized way.
It provides different smoothing algorithms together with the possibility to computes intervals.
In the case of linear regression:
import numpy as np
import matplotlib.pyplot as plt
from tsmoothie.smoother import *
from tsmoothie.utils_func import sim_randomwalk
# generate 10 randomwalks of length 50
np.random.seed(33)
data = sim_randomwalk(n_series=10, timesteps=50,
process_noise=10, measure_noise=30)
# operate smoothing
smoother = PolynomialSmoother(degree=1)
smoother.smooth(data)
# generate intervals
low_pi, up_pi = smoother.get_intervals('prediction_interval', confidence=0.05)
low_ci, up_ci = smoother.get_intervals('confidence_interval', confidence=0.05)
# plot the first smoothed timeseries with intervals
plt.figure(figsize=(11,6))
plt.plot(smoother.smooth_data[0], linewidth=3, color='blue')
plt.plot(smoother.data[0], '.k')
plt.fill_between(range(len(smoother.data[0])), low_pi[0], up_pi[0], alpha=0.3, color='blue')
plt.fill_between(range(len(smoother.data[0])), low_ci[0], up_ci[0], alpha=0.3, color='blue')
In the case of regression with order bigger than 1:
# operate smoothing
smoother = PolynomialSmoother(degree=5)
smoother.smooth(data)
# generate intervals
low_pi, up_pi = smoother.get_intervals('prediction_interval', confidence=0.05)
low_ci, up_ci = smoother.get_intervals('confidence_interval', confidence=0.05)
# plot the first smoothed timeseries with intervals
plt.figure(figsize=(11,6))
plt.plot(smoother.smooth_data[0], linewidth=3, color='blue')
plt.plot(smoother.data[0], '.k')
plt.fill_between(range(len(smoother.data[0])), low_pi[0], up_pi[0], alpha=0.3, color='blue')
plt.fill_between(range(len(smoother.data[0])), low_ci[0], up_ci[0], alpha=0.3, color='blue')
I point out also that tsmoothie can carry out the smoothing of multiple time-series in a vectorized way. Hope this can help someone

Related

scipy weird unexpected behavior curve_fit large data set for sin wave

For some reason when I am trying to large amount of data to a sin wave it fails and fits it to a horizontal line. Can somebody explain?
Minimal working code:
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize
# Seed the random number generator for reproducibility
import pandas
np.random.seed(0)
# Here it work as expected
# x_data = np.linspace(-5, 5, num=50)
# y_data = 2.9 * np.sin(1.05 * x_data + 2) + 250 + np.random.normal(size=50)
# With this data it breaks
x_data = np.linspace(0, 2500, num=2500)
y_data = -100 * np.sin(0.01 * x_data + 1) + 250 + np.random.normal(size=2500)
# And plot it
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data)
def test_func(x, a, b, c, d):
return a * np.sin(b * x + c) + d
# Used to fit the correct function
# params, params_covariance = optimize.curve_fit(test_func, x_data, y_data)
# making some guesses
params, params_covariance = optimize.curve_fit(test_func, x_data, y_data,
p0=[-80, 3, 0, 260])
print(params)
plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_data, test_func(x_data, *params),
label='Fitted function')
plt.legend(loc='best')
plt.show()
Does anybody know, how to fix this issue. Should I use a different fitting method not least square? Or should I reduce the number of data points?
Given your data, you can use the more robust lmfit instead of scipy.
In particular, you can use SineModel (see here for details).
SineModel in lmfit is not for "shifted" sine waves, but you can easily deal with the shift doing
y_data_offset = y_data.mean()
y_transformed = y_data - y_data_offset
plt.scatter(x_data, y_transformed)
plt.axhline(0, color='r')
Now you can fit to sine wave
from lmfit.models import SineModel
mod = SineModel()
pars = mod.guess(y_transformed, x=x_data)
out = mod.fit(y_transformed, pars, x=x_data)
you can inspect results with print(out.fit_report()) and plot results with
plt.plot(x_data, y_data, lw=7, color='C1')
plt.plot(x_data, out.best_fit+y_data_offset, color='k')
# we add the offset ^^^^^^^^^^^^^
or with the builtin plot method out.plot_fit(), see here for details.
Note that in SineModel all parameters "are constrained to be non-negative", so your defined negative amplitude (-100) will be positive (+100) in the parameters fit results. So the phase too won't be 1 but π+1 (PS: they call shift the phase)
print(out.best_values)
{'amplitude': 99.99631403054289,
'frequency': 0.010001193681616227,
'shift': 4.1400215410836605}

How to show precentage in Seaborn countplot [duplicate]

I was wondering if it is possible to create a Seaborn count plot, but instead of actual counts on the y-axis, show the relative frequency (percentage) within its group (as specified with the hue parameter).
I sort of fixed this with the following approach, but I can't imagine this is the easiest approach:
# Plot percentage of occupation per income class
grouped = df.groupby(['income'], sort=False)
occupation_counts = grouped['occupation'].value_counts(normalize=True, sort=False)
occupation_data = [
{'occupation': occupation, 'income': income, 'percentage': percentage*100} for
(income, occupation), percentage in dict(occupation_counts).items()
]
df_occupation = pd.DataFrame(occupation_data)
p = sns.barplot(x="occupation", y="percentage", hue="income", data=df_occupation)
_ = plt.setp(p.get_xticklabels(), rotation=90) # Rotate labels
Result:
I'm using the well known adult data set from the UCI machine learning repository. The pandas dataframe is created like this:
# Read the adult dataset
df = pd.read_csv(
"data/adult.data",
engine='c',
lineterminator='\n',
names=['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'sex',
'capital_gain', 'capital_loss', 'hours_per_week',
'native_country', 'income'],
header=None,
skipinitialspace=True,
na_values="?"
)
This question is sort of related, but does not make use of the hue parameter. And in my case I cannot just change the labels on the y-axis, because the height of the bar must depend on the group.
With newer versions of seaborn you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
df = sns.load_dataset('titanic')
df.head()
x,y = 'class', 'survived'
(df
.groupby(x)[y]
.value_counts(normalize=True)
.mul(100)
.rename('percent')
.reset_index()
.pipe((sns.catplot,'data'), x=x,y='percent',hue=y,kind='bar'))
output
Update: Also show percentages on top of barplots
If you also want percentages, you can do following:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
df.head()
x,y = 'class', 'survived'
df1 = df.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
g = sns.catplot(x=x,y='percent',hue=y,kind='bar',data=df1)
g.ax.set_ylim(0,100)
for p in g.ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x()
txt_y = p.get_height()
g.ax.text(txt_x,txt_y,txt)
I might be confused. The difference between your output and the output of
occupation_counts = (df.groupby(['income'])['occupation']
.value_counts(normalize=True)
.rename('percentage')
.mul(100)
.reset_index()
.sort_values('occupation'))
p = sns.barplot(x="occupation", y="percentage", hue="income", data=occupation_counts)
_ = plt.setp(p.get_xticklabels(), rotation=90) # Rotate labels
is, it seems to me, only the order of the columns.
And you seem to care about that, since you pass sort=False. But then, in your code the order is determined uniquely by chance (and the order in which the dictionary is iterated even changes from run to run with Python 3.5).
You could do this with sns.histplot by setting the following properties:
stat = 'density' (this will make the y-axis the density rather than count)
common_norm = False (this will normalize each density independently)
See the simple example below:
import numpy as np
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')
ax = sns.histplot(x = df['class'], hue=df['survived'], multiple="dodge",
stat = 'density', shrink = 0.8, common_norm=False)
You can use the library Dexplot to do counting as well as normalizing over any variable to get relative frequencies.
Pass the count function the name of the variable you would like to count and it will automatically produce a bar plot of the counts of all unique values. Use split to subdivide the counts by another variable. Notice that Dexplot automatically wraps the x-tick labels.
dxp.count('occupation', data=df, split='income')
Use the normalize parameter to normalize the counts over any variable (or combination of variables with a list). You can also use True to normalize over the grand total of counts.
dxp.count('occupation', data=df, split='income', normalize='income')
It boggled my mind that Seaborn doesn't provide anything like this out of the box.
Still, it was pretty easy to tweak the source code to get what you wanted.
The following code, with the function "percentageplot(x, hue, data)" works just like sns.countplot, but norms each bar per group (i.e. divides each green bar's value by the sum of all green bars)
In effect, it turns this (hard to interpret because different N of Apple vs. Android):
sns.countplot
into this (Normed so that bars reflect proportion of total for Apple, vs Android):
Percentageplot
Hope this helps!!
from seaborn.categorical import _CategoricalPlotter, remove_na
import matplotlib as mpl
class _CategoricalStatPlotter(_CategoricalPlotter):
#property
def nested_width(self):
"""A float with the width of plot elements when hue nesting is used."""
return self.width / len(self.hue_names)
def estimate_statistic(self, estimator, ci, n_boot):
if self.hue_names is None:
statistic = []
confint = []
else:
statistic = [[] for _ in self.plot_data]
confint = [[] for _ in self.plot_data]
for i, group_data in enumerate(self.plot_data):
# Option 1: we have a single layer of grouping
# --------------------------------------------
if self.plot_hues is None:
if self.plot_units is None:
stat_data = remove_na(group_data)
unit_data = None
else:
unit_data = self.plot_units[i]
have = pd.notnull(np.c_[group_data, unit_data]).all(axis=1)
stat_data = group_data[have]
unit_data = unit_data[have]
# Estimate a statistic from the vector of data
if not stat_data.size:
statistic.append(np.nan)
else:
statistic.append(estimator(stat_data, len(np.concatenate(self.plot_data))))
# Get a confidence interval for this estimate
if ci is not None:
if stat_data.size < 2:
confint.append([np.nan, np.nan])
continue
boots = bootstrap(stat_data, func=estimator,
n_boot=n_boot,
units=unit_data)
confint.append(utils.ci(boots, ci))
# Option 2: we are grouping by a hue layer
# ----------------------------------------
else:
for j, hue_level in enumerate(self.hue_names):
if not self.plot_hues[i].size:
statistic[i].append(np.nan)
if ci is not None:
confint[i].append((np.nan, np.nan))
continue
hue_mask = self.plot_hues[i] == hue_level
group_total_n = (np.concatenate(self.plot_hues) == hue_level).sum()
if self.plot_units is None:
stat_data = remove_na(group_data[hue_mask])
unit_data = None
else:
group_units = self.plot_units[i]
have = pd.notnull(
np.c_[group_data, group_units]
).all(axis=1)
stat_data = group_data[hue_mask & have]
unit_data = group_units[hue_mask & have]
# Estimate a statistic from the vector of data
if not stat_data.size:
statistic[i].append(np.nan)
else:
statistic[i].append(estimator(stat_data, group_total_n))
# Get a confidence interval for this estimate
if ci is not None:
if stat_data.size < 2:
confint[i].append([np.nan, np.nan])
continue
boots = bootstrap(stat_data, func=estimator,
n_boot=n_boot,
units=unit_data)
confint[i].append(utils.ci(boots, ci))
# Save the resulting values for plotting
self.statistic = np.array(statistic)
self.confint = np.array(confint)
# Rename the value label to reflect the estimation
if self.value_label is not None:
self.value_label = "{}({})".format(estimator.__name__,
self.value_label)
def draw_confints(self, ax, at_group, confint, colors,
errwidth=None, capsize=None, **kws):
if errwidth is not None:
kws.setdefault("lw", errwidth)
else:
kws.setdefault("lw", mpl.rcParams["lines.linewidth"] * 1.8)
for at, (ci_low, ci_high), color in zip(at_group,
confint,
colors):
if self.orient == "v":
ax.plot([at, at], [ci_low, ci_high], color=color, **kws)
if capsize is not None:
ax.plot([at - capsize / 2, at + capsize / 2],
[ci_low, ci_low], color=color, **kws)
ax.plot([at - capsize / 2, at + capsize / 2],
[ci_high, ci_high], color=color, **kws)
else:
ax.plot([ci_low, ci_high], [at, at], color=color, **kws)
if capsize is not None:
ax.plot([ci_low, ci_low],
[at - capsize / 2, at + capsize / 2],
color=color, **kws)
ax.plot([ci_high, ci_high],
[at - capsize / 2, at + capsize / 2],
color=color, **kws)
class _BarPlotter(_CategoricalStatPlotter):
"""Show point estimates and confidence intervals with bars."""
def __init__(self, x, y, hue, data, order, hue_order,
estimator, ci, n_boot, units,
orient, color, palette, saturation, errcolor, errwidth=None,
capsize=None):
"""Initialize the plotter."""
self.establish_variables(x, y, hue, data, orient,
order, hue_order, units)
self.establish_colors(color, palette, saturation)
self.estimate_statistic(estimator, ci, n_boot)
self.errcolor = errcolor
self.errwidth = errwidth
self.capsize = capsize
def draw_bars(self, ax, kws):
"""Draw the bars onto `ax`."""
# Get the right matplotlib function depending on the orientation
barfunc = ax.bar if self.orient == "v" else ax.barh
barpos = np.arange(len(self.statistic))
if self.plot_hues is None:
# Draw the bars
barfunc(barpos, self.statistic, self.width,
color=self.colors, align="center", **kws)
# Draw the confidence intervals
errcolors = [self.errcolor] * len(barpos)
self.draw_confints(ax,
barpos,
self.confint,
errcolors,
self.errwidth,
self.capsize)
else:
for j, hue_level in enumerate(self.hue_names):
# Draw the bars
offpos = barpos + self.hue_offsets[j]
barfunc(offpos, self.statistic[:, j], self.nested_width,
color=self.colors[j], align="center",
label=hue_level, **kws)
# Draw the confidence intervals
if self.confint.size:
confint = self.confint[:, j]
errcolors = [self.errcolor] * len(offpos)
self.draw_confints(ax,
offpos,
confint,
errcolors,
self.errwidth,
self.capsize)
def plot(self, ax, bar_kws):
"""Make the plot."""
self.draw_bars(ax, bar_kws)
self.annotate_axes(ax)
if self.orient == "h":
ax.invert_yaxis()
def percentageplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None,
orient=None, color=None, palette=None, saturation=.75,
ax=None, **kwargs):
# Estimator calculates required statistic (proportion)
estimator = lambda x, y: (float(len(x))/y)*100
ci = None
n_boot = 0
units = None
errcolor = None
if x is None and y is not None:
orient = "h"
x = y
elif y is None and x is not None:
orient = "v"
y = x
elif x is not None and y is not None:
raise TypeError("Cannot pass values for both `x` and `y`")
else:
raise TypeError("Must pass values for either `x` or `y`")
plotter = _BarPlotter(x, y, hue, data, order, hue_order,
estimator, ci, n_boot, units,
orient, color, palette, saturation,
errcolor)
plotter.value_label = "Percentage"
if ax is None:
ax = plt.gca()
plotter.plot(ax, kwargs)
return ax
You can provide estimators for the height of the bar (along y axis) in a seaborn countplot by using the estimator keyword.
ax = sns.barplot(x="x", y="x", data=df, estimator=lambda x: len(x) / len(df) * 100)
The above code snippet is from https://github.com/mwaskom/seaborn/issues/1027
They have a whole discussion about how to provide percentages in a countplot. This answer is based off the same thread linked above.
In the context of your specific problem, you can probably do something like this:
ax = sb.barplot(x='occupation', y='some_numeric_column', data=raw_data, estimator=lambda x: len(x) / len(raw_data) * 100, hue='income')
ax.set(ylabel="Percent")
The above code worked for me (on a different dataset with different attributes). Note that you need to put in some numeric column for y else, it gives an error: "ValueError: Neither the x nor y variable appears to be numeric."
From this answer, and using "probability" worked best.
Taken from sns.histplot documentation on the "stat" parameter:
Aggregate statistic to compute in each bin.
count: show the number of observations in each bin
frequency: show the number of observations divided by the bin width
probability: or proportion: normalize such that bar heights sum to 1
percent: normalize such that bar heights sum to 100
density: normalize such that the total area of the histogram equals 1
import seaborn as sns
df = sns.load_dataset('titanic')
ax = sns.histplot(
x = df['class'],
hue=df['survived'],
multiple="dodge",
stat = 'probability',
shrink = 0.5,
common_norm=False
)

Using scipy.odr to fit curve

I'm trying to fit a set of data points via a fit function that depends on two variables, let's call these xdata and sdata. Problem is my curve is rather flat I want it to more or less "follow the points".
I've tried using scipy.odr to fit the curve it works rather well except that the curve is too flat:
import numpy as np
from math import pi
from math import sqrt
from math import log
from scipy import optimize
import scipy.optimize
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.odr import *
mudr=np.array([ 57.43708609, 46.26119205, 55.60688742, 33.21615894,
28.27072848, 22.54649007, 21.80662252, 11.21483444, 5.80211921])
#xdata points
dme=array([ 128662.54890776, 105265.32915726, 128652.56835434,
77968.67019573, 66273.56542068, 58464.58559543,
54570.66624991, 27286.90038703, 19480.92689266]) #xdata error
dmss22=np.array([ 4.90050000e+17, 4.90050000e+17, 4.90050000e+17,
4.90050000e+17, 4.90050000e+17, 4.90050000e+17,
4.90050000e+17, 4.90050000e+17, 4.90050000e+17]) #sdata points
dmse=np.array([ 1.09777592e+21, 1.11512117e+21, 1.13381702e+21,
1.15033267e+21, 1.14883089e+21, 1.27076265e+21,
1.22637165e+21, 1.19237598e+21, 1.64539205e+21]) # sdata error
F=np.array([ 115.01944248, 110.24354867, 112.77812389, 104.81830088,
104.35746903, 101.32016814, 100.54513274, 96.94226549,
93.00424779]) #ydata points
dF=np.array([ 72710.75386699, 72590.6256987 , 176539.40403673,
130555.27503081, 124299.52080164, 176426.64340597,
143013.52848306, 122117.93022746, 157547.78395513])#ydata error
def Ffitsso(p,X,B=2.58,Fc=92.2,mu=770,Za=0.9468): #fitfunction
temp1 = (2*B*X[0])/(4*pi*Fc)**2
temp2 = temp1*(afij[0]+afij[1]*np.log((2*B*X[0])/mu**2))
temp3 = temp1**2*(afij[2]+afij[3]*np.log((2*B*X[0])/mu**2)+\
afij[4]*(np.log((2*B*X[0])/mu**2))**2)
temp4 = temp1**3*(afij[5]+afij[6]*np.log((2*B*X[0])/mu**2)+\
afij[7]*(np.log((2*B*X[0])/mu**2))**2+\
afij[8]*(np.log((2*B*X[0])/mu**2))**3)
return Fc/Za*(1+p[0]*X[1])*(1+temp2+temp3+temp4)+p[1]
#fitting using scipy.odr
xtot=np.row_stack( (mudr, dmss22) )
etot=np.row_stack( (Ze, dmss22e) )
fitting = Model(Ffitsso)
mydata = RealData(xtot, F, sx=etot2, sy=dF)
myodr = ODR(mydata, fitting, beta0=[0, 100])
myoutput = myodr.run()
myoutput.pprint()
bet=myoutput.beta
plt.plot(mudr,F,"b^")
plt.plot(mudr,Ffitsso(bet,[mudr,dmss22]))
p[0]*X[0] in the fitfunction is supposed to be small compared to 1 but with the fit the value for p[0] is in order of e-18 whilst dmss22 values are in the order of e-17 which is not small enough.
Even worse is that it's negative meaning the function decreases which is not supposed to happen it's supposed to increase like the plotted data points.
Edit: I fixed, didn't know that it was so sensitive to initial beta values, put beta[0]=1.5*10(-15) and it works!**
Here is a graphical fitter with both curve_fit and ODR fitters using scipy's Differential Evolution (DE) genetic algorithm to supply initial parameter estimates for the non-linear solvers. The scipy implementation of DE uses the Latin Hypercube algorithm to ensure a thorough search of parameter space, and this requires parameter bounds within which to search - in this example, these bounds are taken from the data maximum and minimum values. Note that it is much easier to give bounds for the initial parameter estimates rather than individual specific values.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.odr
from scipy.optimize import differential_evolution
import warnings
xData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.0, 6.6, 7.7, 0.0])
yData = numpy.array([1.1, 20.2, 30.3, 40.4, 50.0, 60.6, 70.7, 0.1])
def func(x, a, b, c, d, offset): # curve fitting function for curve_fit()
return a*numpy.exp(-(x-b)**2/(2*c**2)+d) + offset
def func_wrapper_for_ODR(parameters, x): # parameter order for ODR
return func(x, *parameters)
# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
val = func(xData, *parameterTuple)
return numpy.sum((yData - val) ** 2.0)
def generate_Initial_Parameters():
# min and max used for bounds
maxX = max(xData)
minX = min(xData)
maxY = max(yData)
minY = min(yData)
parameterBounds = []
parameterBounds.append([minY, maxY]) # search bounds for a
parameterBounds.append([minX, maxX]) # search bounds for b
parameterBounds.append([minX, maxX]) # search bounds for c
parameterBounds.append([minY, maxY]) # search bounds for d
parameterBounds.append([0.0, maxY]) # search bounds for Offset
# "seed" the numpy random number generator for repeatable results
result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
return result.x
geneticParameters = generate_Initial_Parameters()
##########################
# curve_fit section
##########################
fittedParameters_curvefit, pcov = curve_fit(func, xData, yData, geneticParameters)
print('Fitted parameters curve_fit:', fittedParameters_curvefit)
print()
modelPredictions_curvefit = func(xData, *fittedParameters_curvefit)
absError_curvefit = modelPredictions_curvefit - yData
SE_curvefit = numpy.square(absError_curvefit) # squared errors
MSE_curvefit = numpy.mean(SE_curvefit) # mean squared errors
RMSE_curvefit = numpy.sqrt(MSE_curvefit) # Root Mean Squared Error, RMSE
Rsquared_curvefit = 1.0 - (numpy.var(absError_curvefit) / numpy.var(yData))
print()
print('RMSE curve_fit:', RMSE_curvefit)
print('R-squared curve_fit:', Rsquared_curvefit)
print()
##########################
# ODR section
##########################
data = scipy.odr.odrpack.Data(xData,yData)
model = scipy.odr.odrpack.Model(func_wrapper_for_ODR)
odr = scipy.odr.odrpack.ODR(data, model, beta0=geneticParameters)
# Run the regression.
odr_out = odr.run()
print('Fitted parameters ODR:', odr_out.beta)
print()
modelPredictions_odr = func(xData, *odr_out.beta)
absError_odr = modelPredictions_odr - yData
SE_odr = numpy.square(absError_odr) # squared errors
MSE_odr = numpy.mean(SE_odr) # mean squared errors
RMSE_odr = numpy.sqrt(MSE_odr) # Root Mean Squared Error, RMSE
Rsquared_odr = 1.0 - (numpy.var(absError_odr) / numpy.var(yData))
print()
print('RMSE ODR:', RMSE_odr)
print('R-squared ODR:', Rsquared_odr)
print()
##########################################################
# graphics output section
def ModelsAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plots
xModel = numpy.linspace(min(xData), max(xData))
yModel_curvefit = func(xModel, *fittedParameters_curvefit)
yModel_odr = func(xModel, *odr_out.beta)
# now the models as line plots
axes.plot(xModel, yModel_curvefit)
axes.plot(xModel, yModel_odr)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.show()
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelsAndScatterPlot(graphWidth, graphHeight)

Matplotlib: different scale on negative side of the axis

Background
I am trying to show three variables on a single plot. I have connected the three points using lines of different colours based on some other variables. This is shown here
Problem
What I want to do is to have a different scale on the negative x-axis. This would help me in providing positive x_ticks, different axis label and also clear and uncluttered representation of the lines on left side of the image
Question
How to have a different positive x-axis starting from 0 towards negative direction?
Have xticks based on data plotted in that direction
Have a separate xlabel for this new axis
Additional information
I have checked other questions regarding inclusion of multiple axes e.g. this and this. However, these questions did not serve the purpose.
Code Used
font_size = 20
plt.rcParams.update({'font.size': font_size})
fig = plt.figure()
ax = fig.add_subplot(111)
#read my_data from file or create it
for case in my_data:
#Iterating over my_data
if condition1 == True:
local_linestyle = '-'
local_color = 'r'
local_line_alpha = 0.6
elif condition2 == 1:
local_linestyle = '-'
local_color = 'b'
local_line_alpha = 0.6
else:
local_linestyle = '--'
local_color = 'g'
local_line_alpha = 0.6
datapoint = [case[0], case[1], case[2]]
plt.plot(datapoint[0], 0, color=local_color)
plt.plot(-datapoint[2], 0, color=local_color)
plt.plot(0, datapoint[1], color=local_color)
plt.plot([datapoint[0], 0], [0, datapoint[1]], linestyle=local_linestyle, color=local_color)
plt.plot([-datapoint[2], 0], [0, datapoint[1]], linestyle=local_linestyle, color=local_color)
plt.show()
exit()
You can define a custom scale, where values below zero are scaled differently than those above zero.
import numpy as np
from matplotlib import scale as mscale
from matplotlib import transforms as mtransforms
from matplotlib.ticker import FuncFormatter
class AsymScale(mscale.ScaleBase):
name = 'asym'
def __init__(self, axis, **kwargs):
mscale.ScaleBase.__init__(self)
self.a = kwargs.get("a", 1)
def get_transform(self):
return self.AsymTrans(self.a)
def set_default_locators_and_formatters(self, axis):
# possibly, set a different locator and formatter here.
fmt = lambda x,pos: "{}".format(np.abs(x))
axis.set_major_formatter(FuncFormatter(fmt))
class AsymTrans(mtransforms.Transform):
input_dims = 1
output_dims = 1
is_separable = True
def __init__(self, a):
mtransforms.Transform.__init__(self)
self.a = a
def transform_non_affine(self, x):
return (x >= 0)*x + (x < 0)*x*self.a
def inverted(self):
return AsymScale.InvertedAsymTrans(self.a)
class InvertedAsymTrans(AsymTrans):
def transform_non_affine(self, x):
return (x >= 0)*x + (x < 0)*x/self.a
def inverted(self):
return AsymScale.AsymTrans(self.a)
Using this you would provide a scale parameter a that scales the negative part of the axes.
# Now that the Scale class has been defined, it must be registered so
# that ``matplotlib`` can find it.
mscale.register_scale(AsymScale)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot([-2, 0, 5], [0,1,0])
ax.set_xscale("asym", a=2)
ax.annotate("negative axis", xy=(.25,0), xytext=(0,-30),
xycoords = "axes fraction", textcoords="offset points", ha="center")
ax.annotate("positive axis", xy=(.75,0), xytext=(0,-30),
xycoords = "axes fraction", textcoords="offset points", ha="center")
plt.show()
The question is not very clear about what xticks and labels are desired, so I left that out for now.
Here's how to get what you want. This solution uses two twined axes object to get different scaling to the left and right of the origin, and then hides all the evidence:
import matplotlib.pyplot as plt
import matplotlib as mpl
from numbers import Number
tickkwargs = {m+k:False for k in ('bottom','top','left','right') for m in ('','label')}
p = np.zeros((10, 3, 2))
p[:,0,0] -= np.arange(10)*.1 + .5
p[:,1,1] += np.repeat(np.arange(5), 2)*.1 + .3
p[:,2,0] += np.arange(10)*.5 + 2
fig = plt.figure(figsize=(8,6))
host = fig.add_subplot(111)
par = host.twiny()
host.set_xlim(-6, 6)
par.set_xlim(-1, 1)
for ps in p:
# mask the points with negative x values
ppos = ps[ps[:,0] >= 0].T
host.plot(*ppos)
# mask the points with positive x values
pneg = ps[ps[:,0] <= 0].T
par.plot(*pneg)
# hide all possible ticks/notation text that could be set by the second x axis
par.tick_params(axis="both", **tickkwargs)
par.xaxis.get_offset_text().set_visible(False)
# fix the x tick labels so they're all positive
host.set_xticklabels(np.abs(host.get_xticks()))
fig.show()
Output:
Here's what the set of points p I used in the code above look like when plotted normally:
fig = plt.figure(figsize=(8,6))
ax = fig.gca()
for ps in p:
ax.plot(*ps.T)
fig.show()
Output:
The method of deriving a class of mscale.ScaleBase as shown in other answers may be too complicated for your purpose.
You can pass two scale transform functions to set_xscale or set_yscale, something like the following.
def get_scale(a=1): # a is the scale of your negative axis
def forward(x):
x = (x >= 0) * x + (x < 0) * x * a
return x
def inverse(x):
x = (x >= 0) * x + (x < 0) * x / a
return x
return forward, inverse
fig, ax = plt.subplots()
forward, inverse = get_scale(a=3)
ax.set_xscale('function', functions=(forward, inverse)) # this is for setting x axis
# do plotting
More examples can be found in this doc.

Discrete Color Bar with Tick labels in between colors

I am trying to plot some data with a discrete color bar. I was following the example given (https://gist.github.com/jakevdp/91077b0cae40f8f8244a) but the issue is this example does not work 1-1 with different spacing. For example, the spacing in the example in the link is for only increasing by 1 but my data is increasing by 0.5. You can see the output from the code I have.. Any help with this would be appreciated. I know I am missing something key here but cant figure it out.
import matplotlib.pylab as plt
import numpy as np
def discrete_cmap(N, base_cmap=None):
"""Create an N-bin discrete colormap from the specified input map"""
# Note that if base_cmap is a string or None, you can simply do
# return plt.cm.get_cmap(base_cmap, N)
# The following works for string, None, or a colormap instance:
base = plt.cm.get_cmap(base_cmap)
color_list = base(np.linspace(0, 1, N))
cmap_name = base.name + str(N)
return base.from_list(cmap_name, color_list, N)
num=11
x = np.random.randn(40)
y = np.random.randn(40)
c = np.random.randint(num, size=40)
plt.figure(figsize=(10,7.5))
plt.scatter(x, y, c=c, s=50, cmap=discrete_cmap(num, 'jet'))
plt.colorbar(ticks=np.arange(0,5.5,0.5))
plt.clim(-0.5, num - 0.5)
plt.show()
Not sure what version of matplotlib/pyplot introduced this, but plt.get_cmap now supports an int argument specifying the number of colors you want to get, for discrete colormaps.
This automatically results in the colorbar being discrete.
By the way, pandas has an even better handling of the colorbar.
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')
# remove if not using Jupyter/IPython
%matplotlib inline
# choose number of clusters and number of points in each cluster
n_clusters = 5
n_samples = 20
# there are fancier ways to do this
clusters = np.array([k for k in range(n_clusters) for i in range(n_samples)])
# generate the coordinates of the center
# of each cluster by shuffling a range of values
clusters_x = np.arange(n_clusters)
clusters_y = np.arange(n_clusters)
np.random.shuffle(clusters_x)
np.random.shuffle(clusters_y)
# get dicts like cluster -> center coordinate
x_dict = dict(enumerate(clusters_x))
y_dict = dict(enumerate(clusters_y))
# get coordinates of cluster center for each point
x = np.array(list(x_dict[k] for k in clusters)).astype(float)
y = np.array(list(y_dict[k] for k in clusters)).astype(float)
# add noise
x += np.random.normal(scale=0.5, size=n_clusters*n_samples)
y += np.random.normal(scale=0.5, size=n_clusters*n_samples)
### Finally, plot
fig, ax = plt.subplots(figsize=(12,8))
# get discrete colormap
cmap = plt.get_cmap('viridis', n_clusters)
# scatter points
scatter = ax.scatter(x, y, c=clusters, cmap=cmap)
# scatter cluster centers
ax.scatter(clusters_x, clusters_y, c='red')
# add colorbar
cbar = plt.colorbar(scatter)
# set ticks locations (not very elegant, but it works):
# - shift by 0.5
# - scale so that the last value is at the center of the last color
tick_locs = (np.arange(n_clusters) + 0.5)*(n_clusters-1)/n_clusters
cbar.set_ticks(tick_locs)
# set tick labels (as before)
cbar.set_ticklabels(np.arange(n_clusters))
Ok so this is the hack I found for my own question. I am sure there is a better way to do this but this works for what I am doing. Feel free to suggest a better way to do this.
import numpy as np
import matplotlib.pylab as plt
def discrete_cmap(N, base_cmap=None):
"""Create an N-bin discrete colormap from the specified input map"""
# Note that if base_cmap is a string or None, you can simply do
# return plt.cm.get_cmap(base_cmap, N)
# The following works for string, None, or a colormap instance:
base = plt.cm.get_cmap(base_cmap)
color_list = base(np.linspace(0, 1, N))
cmap_name = base.name + str(N)
return base.from_list(cmap_name, color_list, N)
num=11
plt.figure(figsize=(10,7.5))
x = np.random.randn(40)
y = np.random.randn(40)
c = np.random.randint(num, size=40)
plt.scatter(x, y, c=c, s=50, cmap=discrete_cmap(num, 'jet'))
cbar=plt.colorbar(ticks=range(num))
plt.clim(-0.5, num - 0.5)
cbar.ax.set_yticklabels(np.arange(0.0,5.5,0.5))
plt.show()
For some reason I cannot upload the image associated with the code above. I get an error when uploading so not sure how to show the final example. But simply I set the color bar axes for tick labels for a vertical color bar and passed in the labels I want and it produced the correct output.