Simple data analysis with Pandas - how to remove repetitive code - pandas

Here's an example:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import numpy as np
dfboth = {
'I': [1,2,3,4,5,6],
'S': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
'DVAR': [800, 300, 820, 330, 910, 350],
'CVAR': [1001, 612, 990, 639, 600, 130]}
dfboth = pd.DataFrame(dfboth)
dfboth = dfboth.assign(DVARCHANGE=dfboth['DVAR'].diff(2))
dfboth = dfboth.assign(CVARCHANGE=dfboth['CVAR'].diff(2))
plt.rcParams["figure.figsize"] = (24, 9) # (w, h)
plt.subplot(2,2,1)
plt.plot('I','DVAR', data=dfboth[dfboth.S=="X"])
plt.plot('I','DVARCHANGE', data=dfboth[dfboth.S=="X"])
plt.title("X-D")
plt.legend()
plt.subplot(2,2,2)
plt.plot('I','DVAR', data=dfboth[dfboth.S=="Y"])
plt.plot('I','DVARCHANGE', data=dfboth[dfboth.S=="Y"])
plt.title("Y-D")
plt.legend()
plt.subplot(2,2,3)
plt.plot('I','CVAR', data=dfboth[dfboth.S=="X"])
plt.plot('I','CVARCHANGE', data=dfboth[dfboth.S=="X"])
plt.title("X-C")
plt.legend()
plt.subplot(2,2,4)
plt.plot('I','CVAR', data=dfboth[dfboth.S=="Y"])
plt.plot('I','CVARCHANGE', data=dfboth[dfboth.S=="Y"])
plt.title("Y-C")
plt.legend()
I have a series of data points (a time series), I=1,2,3 ... and they each pertain to a certain 'S', in this example, X and Y. For each reading, we have two variables DVAR and CVAR. I am trying to make this graph
I compare for S X and Y, DVAR and it's change from the previous reading, and CVAR and it's change in previous reading.
You can also see annoying repetition. But I actually have 12 S's not just X and Y. And I have more variables.
I believe there's a much better way of doing this than I have written using either stacked indexes or some kind of pivot table. But I've not been able to figure it out!

You can use a for-loop:
plot_titles = ["X-D", "Y-D", "X-C", "Y-C"]
y1 = ['DVAR', 'DVAR', 'CVAR', 'CVAR']
y2 = [y + 'CHANGE' for y in y1]
data1 = ["X", "Y", "X", "Y"]
for i in range(4):
plt.subplot(2, 2, i+1)
plt.plot('I', y1[i], data = dfboth[dfboth.S == data1[i]])
plt.plot('I', y2[i], data = dfboth[dfboth.S == data1[i]])
plt.title(plot_titles[i])
plt.legend()

Related

python - generalizing y-axis limits for mean line in density plots

I have this simple dataframe:
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
And as I have many columns (all of them numeric), I did this loop in order to do a specific plot:
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.vlines(x=data.mean(),ymin=0, ymax=0.01, linestyles="dotted")
plt.show()
However, I'm having trouble trying to generalize the ymax argument of plt.vlines(), as I need to get the maximum y-axis value of each density plot in order to plot the mean vline of each plot accordingly. I have tried with np.argmax(), but it doesn't seem to work.
Any suggestions?
pandas.DataFrame.plot() returns matplotlib.axes.Axes object. You can use get_ylim() function to get ymin and ymax.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": np.random.randint(50,53,size=100),
"Y": np.random.randint(200,300,size=100),
"Z": np.random.randint(400,800,size=100)})
for i in df.columns:
data = df[i]
ax = data.plot(kind="kde")
ymin, ymax = ax.get_ylim()
plt.vlines(x=data.mean(),ymin=ymin, ymax=ymax, linestyles="dotted")
plt.show()
To get the value of the kde corresponding to the mean, you could extract the curve from the plot and interpolate it at the position of the mean:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame({"X": 20 + np.random.randint(-1, 2, size=100).cumsum(),
"Y": 30 + np.random.randint(-1, 2, size=100).cumsum(),
"Z": 40 + np.random.randint(-1, 2, size=100).cumsum()})
fig, ax = plt.subplots()
for col in df.columns:
data = df[col]
data.plot(kind="kde", ax=ax)
x = data.mean()
kdeline = ax.lines[-1]
ymax = np.interp(x, kdeline.get_xdata(), kdeline.get_ydata())
ax.vlines(x=data.mean(), ymin=0, ymax=ymax, linestyles="dotted")
ax.set_ylim(ymin=0) # ax.vlines() moves the bottom ylim; set it back to 0
plt.show()
Use plt.axvline. You specify the limits as numbers in the range [0,1], 0 being the bottom of the plot, 1 being the top.
for i in df.columns:
data = df[i]
data.plot(kind="kde")
plt.axvline(data.mean(), 0, 1, linestyle='dotted', color='black')
plt.show()

Delete individual legends from subplots in panda

I would like to remove legends from individual subplots in panda. I created a bar chart and the #subplots. I would like to keep the titles of each subplot and remove the legends since they show the #same verbiage. I have tried several techniques, and even some that has me calling on each individual #subplot but am sure there is a simple solution. The fourth result image below is the one I need help with.
Here is my code so far:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
mouse_metadata = "Mouse_metadata.csv"
study_results = "Study_results.csv"
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)
study_data_combined = pd.merge(mouse_metadata,study_results, on= "Mouse ID")
pyma_sd = study_data_combined
pyma_sd.head()
pyma_sd_grouped = pyma_sd.groupby(["Drug Regimen"])
pyma_sd_grouped_mean = pyma_sd_grouped["Tumor Volume (mm3)"].mean()
pyma_sd_grouped_median = pyma_sd_grouped["Tumor Volume (mm3)"].median()
pyma_sd_grouped_variance = pyma_sd_grouped["Tumor Volume (mm3)"].var()
pyma_sd_grouped_std = pyma_sd_grouped["Tumor Volume (mm3)"].std()
pyma_sd_grouped_sem = pyma_sd_grouped["Tumor Volume (mm3)"].sem()
pyma_sd_grouped_stats = pd.DataFrame({ "Mean":pyma_sd_grouped_mean,
"Median": pyma_sd_grouped_median,
"Variance": pyma_sd_grouped_variance,
"Standard Error of Mean ": pyma_sd_grouped_sem})
print(" ","Stats of Tumor Volume")
print(pyma_sd_grouped_stats)
chart_pyma_sd_grouped_stats = pyma_sd_grouped_stats.plot(kind='bar', rot=50, figsize = (10, 6),
width = .8)
plt.title("Stats on Drug Regimen")Output 2
plt.xlabel("Drug Regimen")
plt.ylabel("Stats per Drug Regimen")
plt.tight_layout()
plt.show()
axes = pyma_sd_grouped_stats.plot.bar(rot=50, subplots=True, figsize = (10, 6), width = .75,)
axes[1].legend(loc=1)
plt.subplots_adjust(hspace=0.5)
plt.show()
**
Simply supply legend=False in your call to DataFrame.plot.bar.
import matplotlib.pyplot as plt
import pandas as pd
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant', 'rabbit', 'giraffe', 'coyote', 'horse']
df = pd.DataFrame({'speed': speed, 'lifespan': lifespan}, index=index)
axes = df.plot.bar(rot=0, subplots=True, legend=False)
plt.show()
Compare the image above to the one generated in the doc.

Custom xticks labels in loglog plot

A simple example is as follows:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
N = 1000
x = np.linspace(1, 5, N)
y = npr.randint(1e16, size = N) / 1e16
y = np.sort(y)
fig, ax = plt.subplots()
ax.loglog(x, y, '.')
ax.grid(True, 'both')
Where I want to replace the xticks. So far everything I tried, failed to work:
ax.set_xticks([2, 3, 4], ['a', 'b', 'c'])
ax.xaxis.set_ticks_position('none')
ax.set_xticks([])
None of the above showed any effect. My goal is to replace the ticks with custom defined ticks (strings or integers). So instead of 2 x 10⁰ it should only be 2. Similar for other xticks.
Probably this is what you're after:
import numpy as np
import matplotlib.pyplot as plt
N = 1000
x = np.linspace(1, 5, N)
y = np.random.rand(N)
y = np.sort(y)
fig, ax = plt.subplots()
ax.loglog(x, y, '.')
ax.grid(True, 'both')
ax.set_xticks([2, 3, 4])
ax.set_xticklabels(['a', 'b', 'c'])
ax.minorticks_off()
plt.show()

Plotting lists with different number of elements in matplotlib

I have a list of numpy arrays, each potentially having a different number of elements, such as:
[array([55]),
array([54]),
array([], dtype=float64),
array([48, 55]),]
I would like to plot this, where each array has an abscissa (x value) assigned, such as [1,2,3,4] so that the plot should show the following points: [[1,55], [2, 54], [4, 48], [4, 55]].
Is there a way I can do that with matplotlib? or how can I transform the data with numpy or pandas first so that it is can be plotted?
What you want to do is chain the original array and generate a new array with "abscissas". There are many way to concatenated, one of the most efficient is using itertools.chain.
import itertools
from numpy import array
x = [array([55]), array([54]), array([]), array([48, 55])]
ys = list(itertools.chain(*x))
# this will be [55, 54, 48, 55]
# generate abscissas
xs = list(itertools.chain(*[[i+1]*len(x1) for i, x1 in enumerate(x)]))
Now you can just plot easily with matplotlib as below
import matplotlib.pyplot as plt
plt.plot(xs, ys)
If you want to have different markers for different groups of data (the colours are automatically cycled by matplotlib):
import numpy as np
import matplotlib.pyplot as plt
markers = ['o', #'circle',
'v', #'triangle_down',
'^', #'triangle_up',
'<', #'triangle_left',
'>', #'triangle_right',
'1', #'tri_down',
'2', #'tri_up',
'3', #'tri_left',
'4', #'tri_right',
'8', #'octagon',
's', #'square',
'p', #'pentagon',
'h', #'hexagon1',
'H', #'hexagon2',
'D', #'diamond',
'd', #'thin_diamond'
]
n_markers = len(markers)
a = [10.*np.random.random(int(np.random.random()*10)) for i in xrange(n_markers)]
fig = plt.figure()
ax = fig.add_subplot(111)
for i, data in enumerate(a):
xs = data.shape[0]*[i,] # makes the abscissas list
marker = markers[i % n_markers] # picks a valid marker
ax.plot(xs, data, marker, label='data %d, %s'%(i, marker))
ax.set_xlim(-1, 1.4*len(a))
ax.set_ylim(0, 10)
ax.legend(loc=None)
fig.tight_layout()
Notice the limits to y scale are hard coded, change accordingly. The 1.4*len(a) is meant to leave room on the right side of the graph for the legend.
The example above has no point in the x=0 (would be dark blue circles) as the randomly assigned size for its data set was zero, but you can easily place a +1 if you don't want to use x=0.
Using pandas to create a numpy array with nans inserted when an array is empty or shorter than the longest array in the list...
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
arr_list = [np.array([55]),
np.array([54]),
np.array([], dtype='float64'),
np.array([48, 55]),]
df = pd.DataFrame(arr_list)
list_len = len(df)
repeats = len(list(df))
vals = df.values.flatten()
xax = np.repeat(np.arange(list_len) + 1, repeats)
df_plot = pd.DataFrame({'xax': xax, 'vals': vals})
plt.scatter(df_plot.xax, df_plot.vals);
with x your list :
[plt.plot(np.repeat(i,len(x[i])), x[i],'.') for i in range(len(x))]
plt.show()
#Alessandro Mariani's answer based on itertools made me think of another way to generate an array containg the data I needed. In some cases it may be more compact. It is also based on itertools.chain:
import itertools
from numpy import array
y = [array([55]), array([54]), array([]), array([48, 55])]
x = array([1,2,3,4])
d = array(list(itertools.chain(*[itertools.product([t], n) for t, n in zip(x,y)])))
d is now the following array:
array([[ 1, 55],
[ 2, 54],
[ 4, 48],
[ 4, 55]])

Setting xticklabels, x axis formatting in matplotlib

I would like to format my x axis with the legend values at the mid point of each bar whilst retaining the gender group identification. I'd like lower the gender groups to sit below the other xticklabels for clarity.
To this point, I've added xticks but actually labeling them correctly and neatly is proving trickier.
from itertools import chain, cycle
import logging
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
matplotlib.style.use("ggplot")
m = {"Males" : {"Yes": 2, "No": 8}}
w = {"Females": {"Yes": 3, "No": 7}}
data = {**m, **w}
df = DataFrame(data)
# relative freq table
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
plt.show()
The following might be what you're looking for.
from itertools import chain
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame
matplotlib.style.use("ggplot")
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df_ft = (df / df.sum() * 100).T
ax = plt.subplot(111)
df_ft.plot(ax=ax, kind="bar", ylim=(0, 90),
title="Would you prefer to work at home? (10 males, 10 females)",
rot=0)
plt.ylabel("Relative Frequency (%)")
midp = 0.125 # standard bar width/2
t_l = ax.get_xticks().tolist()
ticks = list(chain.from_iterable((t - midp, t + midp) for t in t_l))
ax.set_xticks(t_l + ticks)
labels = [l for l in ax.get_xticklabels()]
for i,l in enumerate(labels[len(df_ft):]):
l.set_text(df_ft.columns[i % len(df_ft.columns)])
for i,l in enumerate(labels[:len(df_ft)]):
l.set_text("\n"+l.get_text())
ax.set_xticklabels(labels)
plt.savefig(__file__+".png")
plt.show()
Altair would do a great job here.
from altair import *
from pandas import DataFrame
df = DataFrame({'Males': {'Yes': 2, 'No': 8}, 'Females': {'Yes': 3, 'No': 7}})
df = df.stack().reset_index()
df.columns=['response','gender','count']
Vis #1
Chart(df).mark_bar().encode(x='gender',y='count',color='response').configure_cell(width=200, height=200)
Vis 2
Chart(df).mark_bar().encode(x=X('response', axis=False),
y=Y('count', axis=Axis(grid=False)),
color='response',
column=Column('gender', axis=Axis(axisWidth=1.0, offset=-8.0, orient='bottom'),scale=Scale(padding=30.0))).configure_cell(width=200, height=200).configure_facet_cell(strokeWidth=0)