How to add droplines to a seaborn scatterplot? - matplotlib

Using the following example code in a Jupyter notebook:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
df = pd.DataFrame(np.random.rand(5, 2), columns=['a', 'b'])
sns.set()
g = sns.relplot(data=df, x='a', y='b', kind='scatter');
g.set(xlim=(0, 1))
g.set(ylim=(0, 1));
The resulting plot shows the data points, but I would also like to have vertical drop lines and occasionally horizontal ones as well. To clarify what I mean by droplines, here is a mockup of the actual vs. the desired output:
Update: A little more complex input that makes it harder to manually draw the lines:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
df = pd.DataFrame(np.random.rand(20, 3), columns=['a', 'b', 'c'])
df['d'] = ['apples', 'bananas', 'cherries', 'dates'] * 5
sns.set()
g = sns.relplot(data=df, x='a', y='b', hue='c', col='d', col_wrap=2, kind='scatter');
g.set(xlim=(0, 1))
g.set(ylim=(0, 1));

There are several ways to plot vertical/horizontal lines. One of the is to use hlines or vlines. This can be done using a loop for sake of ease.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(121)
fig, ax = plt.subplots()
df = pd.DataFrame(np.random.rand(5, 2), columns=['a', 'b'])
sns.set()
g = sns.relplot(data=df, x='a', y='b', kind='scatter', color='blue', ax=ax);
for x, y in zip(df['a'], df['b']):
ax.hlines(y, 0, x, color='blue')
ax.vlines(x, 0, y, color='blue')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.close(g.fig)

Related

who to plot stats.probplot in a grid?

I have a data frame with four columns I would like to plot the normality test for each column in a 2*2 grid, but it only plot one figure, and the else is empty.
import random
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2,2, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
data = {'col1': [random.randrange(1, 50, 1) for i in range(1000)], 'col2': [random.randrange(1, 50, 1) for i in range(1000)],'col3':[random.randrange(1, 50, 1) for i in range(1000)]
,'col4':[random.randrange(1, 50, 1) for i in range(1000)]}
df = pd.DataFrame(data)
for ax, d in zip(axs.ravel(), df):
ax=stats.probplot(df[d], plot=plt)
#ax.set_title(str(d))
plt.show()
is there a way to construct the subplot and the stats.probplot within a loop?
In your code, you need to change the for loop to this:
for ax, d in zip(axs.ravel(), df):
stats.probplot(df[d], plot=ax)
#ax.set_titl(str(d))
plt.show()
I hope this will help you move on.

How to annotate in 2 decimal places using Matplotlib

I am trying to create a heatmap displaying correlation coefficient values. I'm quite new at this, but the code below would annotate in multiple decimal places, whereas i'm trying to narrow down to 2 d.p.
Does anyone have experience with this?
import pandas_datareader.data as web
import pandas as pd
import datetime as dt
import csv
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import seaborn as sns
style.use('ggplot')
def visualize_data():
df = pd.read_csv('sti_joined.csv')
df.set_index('Date', inplace=True)
df_corr = df.pct_change().corr()
print(df_corr.head())
data = df_corr.values
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# heatmap = ax.pcolor(data, cmap=plt.cm.get_cmap('RdYlGn'))
heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
fig.colorbar(heatmap)
ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
ax.invert_yaxis()
ax.xaxis.tick_top()
for y in range(data.shape[0]):
for x in range(data.shape[1]):
plt.text(x + 0.5, y + 0.5, '%.4f' % data[y, x],
horizontalalignment='center',
verticalalignment='center',
)
column_labels = df_corr.columns
row_labels = df_corr.index
ax.set_xticklabels(column_labels)
ax.set_yticklabels(row_labels)
plt.xticks(rotation=90)
heatmap.set_clim(-1,1)
plt.tight_layout()
plt.show()
visualize_data()
Instead of '%.4f' % data[y, x], you can try using something like
'{0:.2f}'.format(data[y,x])

Seaborn boxplot custom lables aside box

I have the code segment given below, and it generates the provided boxplot. I would like to know how to add custom labels aside each box, so that the boxplot is even more digestible to the readers of my result. The expected diagram is also provided. I reckon there should be an easy way to get this done in Seaborn/Matplotlib.
What I exactly want is to add the following labels to each box (on left hand side as in shown in the example provided)
The code use to generate boxplot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as MaxNLocator
from matplotlib import rcParams
from matplotlib.ticker import ScalarFormatter, FuncFormatter,FormatStrFormatter, EngFormatter#, mticker
%matplotlib inline
import seaborn as sns
range_stats = pd.read_csv(f'{snappy_data_dir}range_searcg_snappy_stats.csv')
data_stats_rs_txt = range_stats[range_stats['category'] == "t"]
data_stats_rs_seq = range_stats[range_stats['category'] == "s"]
fig, ax =plt.subplots(1,2)
rcParams['figure.figsize'] =8, 6
flierprops = dict(marker='x')
labels1 = ['R1', 'R2', 'R3', 'R4', 'R5']
sns.boxplot(x='Interval',y='Total',data=data_stats_rs_txt,palette='rainbow', ax=ax[0])
sns.boxplot(x='Interval',y='Total',data=data_stats_rs_seq,palette='rainbow', ax=ax[1])
ax[0].set(xlabel='Interval (s)', ylabel='query execution time (s)', title='Text format', ylim=(0, 290))
ax[1].set(xlabel='Interval (s)', ylabel='', title='Proposed format',ylim=(0, 290), yticklabels=[])
plt.savefig("range-query-corrected.svg")
plt.savefig('snappy_compressed_rangesearch.pdf')
Resulted figure:
Expected figure with labels
This might help you, although it is not a fully correct way and is not a complete solution.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
tips = sns.load_dataset('tips')
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.set_context('poster',font_scale=0.5)
sns.boxplot(x="day", y="total_bill", data=tips,palette='rainbow', ax=axes[0], zorder=0)
axes[0].text(0, 45, r"$B1$", fontsize=20, color="blue")
axes[0].text(0.9, 45, r"$B2$", fontsize=20, color="blue")
axes[0].text(2.2, 45, r"$B3$", fontsize=20, color="blue")
axes[0].text(3.1, 45, r"$B4$", fontsize=20, color="blue");
sns.boxplot(x="day", y="tip", data=tips,palette='rainbow', ax=axes[1], zorder=10)
iris = sns.load_dataset("iris")
x_var = 'species'
y_var = 'sepal_width'
x_order = ['setosa', 'versicolor', 'virginica']
labels = ['R1','R2','R3']
max_vals = iris.groupby(x_var).max()[y_var].reindex(x_order)
ax = sns.boxplot(x=x_var, y=y_var, data=iris)
for x,y,l in zip(range(len(x_order)), max_vals, labels):
ax.annotate(l, xy=[x,y], xytext=[0,5], textcoords='offset pixels', ha='center', va='bottom')

Setting xticklabels, x axis formatting in matplotlib

I would like to format my x axis with the legend values at the mid point of each bar whilst retaining the gender group identification. I'd like lower the gender groups to sit below the other xticklabels for clarity.
To this point, I've added xticks but actually labeling them correctly and neatly is proving trickier.
from itertools import chain, cycle
import logging
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
matplotlib.style.use("ggplot")
m = {"Males" : {"Yes": 2, "No": 8}}
w = {"Females": {"Yes": 3, "No": 7}}
data = {**m, **w}
df = DataFrame(data)
# relative freq table
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
plt.show()
The following might be what you're looking for.
from itertools import chain
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame
matplotlib.style.use("ggplot")
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
labels = [l for l in ax.get_xticklabels()]
for i,l in enumerate(labels[len(df_ft):]):
l.set_text(df_ft.columns[i % len(df_ft.columns)])
for i,l in enumerate(labels[:len(df_ft)]):
l.set_text("\n"+l.get_text())
ax.set_xticklabels(labels)
plt.savefig(__file__+".png")
plt.show()
Altair would do a great job here.
from altair import *
from pandas import DataFrame
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df = df.stack().reset_index()
df.columns=['response','gender','count']
Vis #1
Chart(df).mark_bar().encode(x='gender',y='count',color='response').configure_cell(width=200, height=200)
Vis 2
Chart(df).mark_bar().encode(x=X('response', axis=False),
y=Y('count', axis=Axis(grid=False)),
color='response',
column=Column('gender', axis=Axis(axisWidth=1.0, offset=-8.0, orient='bottom'),scale=Scale(padding=30.0))).configure_cell(width=200, height=200).configure_facet_cell(strokeWidth=0)

controlling the number of x ticks in pyplot

I want to display all 13 x ticks, but the graph only shows 7 of them having two intervals.
plt.locator_params(axis='x',nbins=13)
Why doesn't above code work??
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as dates
y = [0, 0.86, 0.826, 0.816, 0.807, 0.803, 0.804, 0.803, 0.802,0.81, 0.813, 0.813, 0.813]
times = pd.date_range('2015-02-25', periods=13)
fig, ax = plt.subplots(1)
fig.autofmt_xdate()
xfmt = dates.DateFormatter('%d-%m-%y')
ax.xaxis.set_major_formatter(xfmt)
plt.locator_params(axis='x',nbins=13)
ax.plot_date(times.to_pydatetime(), y, 'v-')
ax.xaxis.set_minor_locator(dates.WeekdayLocator(byweekday=(1),
interval=1))
ax.xaxis.set_minor_formatter(dates.DateFormatter('%d\n%a'))
ax.xaxis.grid(True, which="minor")
ax.yaxis.grid()
plt.tight_layout()
plt.show()
The warning should give you some clue why this is happening:
UserWarning: 'set_params()' not defined for locator of type <class 'pandas.tseries.converter.PandasAutoDateLocator'>
str(type(self)))
Use plt.xticks(times.to_pydatetime()) instead:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as dates
y = [0, 0.86, 0.826, 0.816, 0.807, 0.803, 0.804, 0.803, 0.802,0.81, 0.813, 0.813, 0.813]
times = pd.date_range('2015-02-25', periods=13)
fig, ax = plt.subplots(1)
fig.autofmt_xdate()
xfmt = dates.DateFormatter('%d-%m-%y')
ax.xaxis.set_major_formatter(xfmt)
ax.plot_date(times.to_pydatetime(), y, 'v-')
ax.xaxis.set_minor_locator(dates.WeekdayLocator(byweekday=(1),
interval=1))
plt.xticks(times.to_pydatetime())
ax.xaxis.set_minor_formatter(dates.DateFormatter('%d\n%a'))
ax.xaxis.grid(True, which="minor")
ax.yaxis.grid()
plt.tight_layout()
plt.show()