Setting xticklabels, x axis formatting in matplotlib - matplotlib

I would like to format my x axis with the legend values at the mid point of each bar whilst retaining the gender group identification. I'd like lower the gender groups to sit below the other xticklabels for clarity.
To this point, I've added xticks but actually labeling them correctly and neatly is proving trickier.
from itertools import chain, cycle
import logging
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
matplotlib.style.use("ggplot")
m = {"Males" : {"Yes": 2, "No": 8}}
w = {"Females": {"Yes": 3, "No": 7}}
data = {**m, **w}
df = DataFrame(data)
# relative freq table
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
plt.show()

The following might be what you're looking for.
from itertools import chain
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame
matplotlib.style.use("ggplot")
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
labels = [l for l in ax.get_xticklabels()]
for i,l in enumerate(labels[len(df_ft):]):
l.set_text(df_ft.columns[i % len(df_ft.columns)])
for i,l in enumerate(labels[:len(df_ft)]):
l.set_text("\n"+l.get_text())
ax.set_xticklabels(labels)
plt.savefig(__file__+".png")
plt.show()

Altair would do a great job here.
from altair import *
from pandas import DataFrame
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df = df.stack().reset_index()
df.columns=['response','gender','count']
Vis #1
Chart(df).mark_bar().encode(x='gender',y='count',color='response').configure_cell(width=200, height=200)
Vis 2
Chart(df).mark_bar().encode(x=X('response', axis=False),
y=Y('count', axis=Axis(grid=False)),
color='response',
column=Column('gender', axis=Axis(axisWidth=1.0, offset=-8.0, orient='bottom'),scale=Scale(padding=30.0))).configure_cell(width=200, height=200).configure_facet_cell(strokeWidth=0)

Related

who to plot stats.probplot in a grid?

I have a data frame with four columns I would like to plot the normality test for each column in a 2*2 grid, but it only plot one figure, and the else is empty.
import random
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2,2, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
data = {'col1': [random.randrange(1, 50, 1) for i in range(1000)], 'col2': [random.randrange(1, 50, 1) for i in range(1000)],'col3':[random.randrange(1, 50, 1) for i in range(1000)]
,'col4':[random.randrange(1, 50, 1) for i in range(1000)]}
df = pd.DataFrame(data)
for ax, d in zip(axs.ravel(), df):
ax=stats.probplot(df[d], plot=plt)
#ax.set_title(str(d))
plt.show()
is there a way to construct the subplot and the stats.probplot within a loop?
In your code, you need to change the for loop to this:
for ax, d in zip(axs.ravel(), df):
stats.probplot(df[d], plot=ax)
#ax.set_titl(str(d))
plt.show()
I hope this will help you move on.

Matplotlib - Line Plot

I am trying to plot an array of 101 rows * 12 Columns, with row #1 as a highlight using the code below:
plt.plot(HW.transpose()[1:101],color = 'grey', alpha = 0.1)
plt.plot(HW.transpose()[0],color = 'red', linewidth = 3, alpha = 0.7)
The only issue in this graph is that 'S1' somehow ends up in the last instead of beginning. What am I doing wrong?
HW.transpose()[1:101] doesn't select the desired columns. You can use HW.transpose().iloc[:, 1:101] instead:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
HW = pd.DataFrame(np.random.randn(101, 12).cumsum(axis=1), columns=[f'S{i}' for i in range(1, 13)])
plt.plot(HW.transpose().iloc[:, 1:101], color='grey', alpha=0.1)
plt.plot(HW.transpose().iloc[:, 0], color='red', linewidth=3, alpha=0.7)
plt.show()

python - generalizing y-axis limits for mean line in density plots

I have this simple dataframe:
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
And as I have many columns (all of them numeric), I did this loop in order to do a specific plot:
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.vlines(x=data.mean(),ymin=0, ymax=0.01, linestyles="dotted")
plt.show()
However, I'm having trouble trying to generalize the ymax argument of plt.vlines(), as I need to get the maximum y-axis value of each density plot in order to plot the mean vline of each plot accordingly. I have tried with np.argmax(), but it doesn't seem to work.
Any suggestions?
pandas.DataFrame.plot() returns matplotlib.axes.Axes object. You can use get_ylim() function to get ymin and ymax.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
for i in df.columns:
data = df[i]
ax = data.plot(kind="kde")
ymin, ymax = ax.get_ylim()
plt.vlines(x=data.mean(),ymin=ymin, ymax=ymax, linestyles="dotted")
plt.show()
To get the value of the kde corresponding to the mean, you could extract the curve from the plot and interpolate it at the position of the mean:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": 20 + np.random.randint(-1, 2, size=100).cumsum(),
"Y": 30 + np.random.randint(-1, 2, size=100).cumsum(),
"Z": 40 + np.random.randint(-1, 2, size=100).cumsum()})
fig, ax = plt.subplots()
for col in df.columns:
data = df[col]
data.plot(kind="kde", ax=ax)
x = data.mean()
kdeline = ax.lines[-1]
ymax = np.interp(x, kdeline.get_xdata(), kdeline.get_ydata())
ax.vlines(x=data.mean(), ymin=0, ymax=ymax, linestyles="dotted")
ax.set_ylim(ymin=0) # ax.vlines() moves the bottom ylim; set it back to 0
plt.show()
Use plt.axvline. You specify the limits as numbers in the range [0,1], 0 being the bottom of the plot, 1 being the top.
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.axvline(data.mean(), 0, 1, linestyle='dotted', color='black')
plt.show()

How to annotate in 2 decimal places using Matplotlib

I am trying to create a heatmap displaying correlation coefficient values. I'm quite new at this, but the code below would annotate in multiple decimal places, whereas i'm trying to narrow down to 2 d.p.
Does anyone have experience with this?
import pandas_datareader.data as web
import pandas as pd
import datetime as dt
import csv
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import seaborn as sns
style.use('ggplot')
def visualize_data():
df = pd.read_csv('sti_joined.csv')
df.set_index('Date', inplace=True)
df_corr = df.pct_change().corr()
print(df_corr.head())
data = df_corr.values
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# heatmap = ax.pcolor(data, cmap=plt.cm.get_cmap('RdYlGn'))
heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
fig.colorbar(heatmap)
ax.set_xticks(np.arange(data.shape[0]) + 0.5, minor=False)
ax.set_yticks(np.arange(data.shape[1]) + 0.5, minor=False)
ax.invert_yaxis()
ax.xaxis.tick_top()
for y in range(data.shape[0]):
for x in range(data.shape[1]):
plt.text(x + 0.5, y + 0.5, '%.4f' % data[y, x],
horizontalalignment='center',
verticalalignment='center',
)
column_labels = df_corr.columns
row_labels = df_corr.index
ax.set_xticklabels(column_labels)
ax.set_yticklabels(row_labels)
plt.xticks(rotation=90)
heatmap.set_clim(-1,1)
plt.tight_layout()
plt.show()
visualize_data()
Instead of '%.4f' % data[y, x], you can try using something like
'{0:.2f}'.format(data[y,x])

How to add droplines to a seaborn scatterplot?

Using the following example code in a Jupyter notebook:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
df = pd.DataFrame(np.random.rand(5, 2), columns=['a', 'b'])
sns.set()
g = sns.relplot(data=df, x='a', y='b', kind='scatter');
g.set(xlim=(0, 1))
g.set(ylim=(0, 1));
The resulting plot shows the data points, but I would also like to have vertical drop lines and occasionally horizontal ones as well. To clarify what I mean by droplines, here is a mockup of the actual vs. the desired output:
Update: A little more complex input that makes it harder to manually draw the lines:
import pandas as pd
import seaborn as sns
import numpy as np
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
df = pd.DataFrame(np.random.rand(20, 3), columns=['a', 'b', 'c'])
df['d'] = ['apples', 'bananas', 'cherries', 'dates'] * 5
sns.set()
g = sns.relplot(data=df, x='a', y='b', hue='c', col='d', col_wrap=2, kind='scatter');
g.set(xlim=(0, 1))
g.set(ylim=(0, 1));
There are several ways to plot vertical/horizontal lines. One of the is to use hlines or vlines. This can be done using a loop for sake of ease.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(121)
fig, ax = plt.subplots()
df = pd.DataFrame(np.random.rand(5, 2), columns=['a', 'b'])
sns.set()
g = sns.relplot(data=df, x='a', y='b', kind='scatter', color='blue', ax=ax);
for x, y in zip(df['a'], df['b']):
ax.hlines(y, 0, x, color='blue')
ax.vlines(x, 0, y, color='blue')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.close(g.fig)