Pandas boxplot side by side for different DataFrame - pandas

Even though there are nice examples online of plotting side-by-side boxplots, with my data stored in two different pandas DataFrames and already having some subplots, I have not managed to get my boxplots next to each other instead of overlapping.
My code is as follows:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Select the non-interactive Agg backend (renders off-screen, no GUI window).
mpl.use('agg')

# One subplot row per percentile; all rows share the same x and y axes.
fig, axarr = plt.subplots(3, sharex=True, sharey=True, figsize=(9, 6))
month = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
percentiles = [90, 95, 98]

# `p` is not used in this reduced example; it only determines how many
# subplot rows get filled.
for ax, p in zip(axarr, percentiles):
    # Random stand-in data: 30 samples (rows) for each of the 12 months (columns).
    future_data = pd.DataFrame(np.random.randint(0, 30, size=(30, 12)), columns=month)
    present_data = pd.DataFrame(np.random.randint(0, 30, size=(30, 12)), columns=month)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    Future = future_data.to_numpy()
    Present = present_data.to_numpy()

    # Both calls rely on boxplot's default positions (1..12), so the "present"
    # and "future" boxes of a month are drawn on top of each other -- this is
    # the overlap the question is about.
    pp = ax.boxplot(Present, patch_artist=True, showfliers=False)
    fp = ax.boxplot(Future, patch_artist=True, showfliers=False)
The result looks as follows:
Overlapping Boxplots
Could you help me make sure the boxes are drawn next to each other, so I can compare them without being bothered by the overlap?
Thank you!
EDIT: I have reduced the code somewhat so it can run like this.

You need to position your boxes manually, i.e. pass the positions as an array to the `positions` argument of `boxplot`. Here it makes sense to shift one set by -0.2 and the other by +0.2 from their integer positions. You can then choose the `widths` so that the two boxes together are narrower than the spacing between consecutive integer positions (here 0.4 + 0.4 = 0.8 < 1), which keeps the pair of one month separate from the pair of the next.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# One subplot row per percentile; all rows share the same x and y axes.
fig, axarr = plt.subplots(3, sharex=True, sharey=True, figsize=(9, 6))
month = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
percentiles = [90, 95, 98]

# Integer slot of each month on the x axis (0-based, one slot per column).
centers = np.arange(len(month))

# `p` is not used in this reduced example; it only determines how many
# subplot rows get filled.
for ax, p in zip(axarr, percentiles):
    # Random stand-in data: 30 samples (rows) for each of the 12 months (columns).
    future_data = pd.DataFrame(np.random.randint(0, 30, size=(30, 12)), columns=month)
    present_data = pd.DataFrame(np.random.randint(0, 30, size=(30, 12)), columns=month)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    Future = future_data.to_numpy()
    Present = present_data.to_numpy()

    # Shift "present" 0.2 to the left and "future" 0.2 to the right of each
    # month's slot; with widths of 0.4 the two boxes sit side by side.
    pp = ax.boxplot(Present, patch_artist=True, showfliers=False,
                    positions=centers - .2, widths=0.4)
    fp = ax.boxplot(Future, patch_artist=True, showfliers=False,
                    positions=centers + .2, widths=0.4)

# boxplot() puts the ticks at the shifted positions, so restore one labelled
# tick per month. The x axis is shared, so setting it on the last row is enough.
axarr[-1].set_xticks(np.arange(len(month)))
axarr[-1].set_xticklabels(month)
axarr[-1].set_xlim(-0.5, len(month) - .5)
plt.show()

Related

Barplot per each ax in matplotlib

I have the following dataset, ratings in stars for two fictitious places:
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({'id':['A','A','A','A','A','A','A','B','B','B','B','B','B'],
'rating':[1,2,4,5,5,5,3,1,3,3,3,5,2]})
Since the rating is categorical (not continuous) data, I convert it to a category:
df['rating_cat'] = pd.Categorical(df['rating'])
What I want is to create one bar plot for each fictitious place ('A' or 'B'), showing the count of each rating. This is the intended plot:
I guess a for loop over each value in id could work, but I have some trouble deciding the size:
# One row of two axes, one per place.
fig, ax = plt.subplots(1,2,figsize=(6,6))
# Flattened view of the axes array; not used below (the loop indexes `ax` directly).
axs = ax.flatten()
# List of the distinct rating categories; not used below.
cats = df['rating_cat'].cat.categories.tolist()
ids_uniques = df.id.unique()
# NOTE(review): the loop body's indentation appears to have been lost in this listing.
# NOTE: `Series.size` is an integer attribute, not a method, so `df['rating'].size()`
# raises "TypeError: 'int' object is not callable". The first argument is also a
# whole filtered DataFrame rather than a sequence of x positions.
for i in range(len(ids_uniques)):
ax[i].bar(df[df['id']==ids_uniques[i]], df['rating'].size())
But it raises an error: TypeError: 'int' object is not callable
Perhaps what I am doing is overly complicated; could you please guide me with this code?
The pure matplotlib way:
from math import ceil

# Prepare the data for plotting: number of rows for each (id, rating) pair.
df_plot = df.groupby(["id", "rating"]).size()
unique_ids = df_plot.index.get_level_values("id").unique()

# Calculate the grid spec. This will be an n x 2 grid
# to fit one chart per id.
ncols = 2
nrows = ceil(len(unique_ids) / ncols)

fig = plt.figure(figsize=(6, 6))
for i, id_ in enumerate(unique_ids):
    # In a figure grid spanning nrows x ncols, plot into the
    # axes at position i + 1 (subplot indices are 1-based).
    ax = fig.add_subplot(nrows, ncols, i + 1)
    # pandas selects the target axes through the `ax` keyword. The previous
    # `axes=ax` was forwarded to Matplotlib instead and only worked because the
    # freshly added subplot happened to be the current axes.
    df_plot.xs(id_).plot(ax=ax, kind="bar")
You can simplify things a lot with Seaborn:
import seaborn as sns
sns.catplot(data=df, x="rating", col="id", col_wrap=2, kind="count")
If you're ok with installing a new library, seaborn has a very helpful countplot. Seaborn uses matplotlib under the hood and makes certain plots easier.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Thirteen star ratings: seven for place 'A' followed by six for place 'B'.
place_ids = ['A'] * 7 + ['B'] * 6
star_ratings = [1, 2, 4, 5, 5, 5, 3, 1, 3, 3, 3, 5, 2]
df = pd.DataFrame({'id': place_ids, 'rating': star_ratings})

# Count the rows for every rating value, with one bar colour per place.
sns.countplot(x='rating', hue='id', data=df)

# Display the figure, then release it.
plt.show()
plt.close()

matplotlib plot from dataframe but shift dates in x labels

I have this dataframe:
dates;A;B;C
2018-01-31;1;2;5
2018-02-28;1;4;3
2018-03-31;1;5;5
2018-04-30;1;6;3
2018-05-31;1;6;7
2018-06-30;1;7;3
2018-07-31;1;9;9
2018-08-31;1;2;3
2018-09-30;1;2;10
2018-10-31;1;4;3
2018-11-30;1;7;11
2018-12-31;1;2;3
I read it:
dfr = pd.read_csv('test.dat', sep=';', header = 0, index_col=0, parse_dates=True)
and then I try to plot it:
# NOTE(review): `axs` is not created in this snippet; presumably it is an existing
# Matplotlib Axes -- confirm.
# Bar width in x-axis units; date2num() yields days, so each bar is 5 days wide.
width = 5
dfr.index = pd.to_datetime(dfr.index)
# Convert the datetime index to Matplotlib's floating-point date numbers.
x = date2num(dfr.index)
# Two bars per date, shifted half a width to either side of it. With 'dates' as
# the index, iloc[:, 1] and iloc[:, 2] pick the second and third data columns.
axs.bar(x-0.5*width,dfr.iloc[:,1], width=width)
axs.bar(x+0.5*width,dfr.iloc[:,2], width=width)
# Tell the axis to treat x values as dates.
axs.xaxis_date()
# NOTE(review): MonthLocator places ticks on the first day of each month by
# default, while the data points are month-end dates. The 31-Jan bars therefore
# sit right next to the 1-Feb tick, which is presumably why the labels seem to
# start at 'Feb' -- confirm.
months = dates.MonthLocator()
# NOTE(review): \textbf{...} is LaTeX markup; presumably text.usetex is enabled
# elsewhere -- confirm.
axs.xaxis.set_major_formatter(dates.DateFormatter(r'\textbf{%B}'))
# Plain month-name formatter; not used anywhere in this snippet.
months_f = dates.DateFormatter('%B')
axs.xaxis.set_major_locator(months)
# Rotate the major tick labels by 90 degrees.
plt.setp( axs.xaxis.get_majorticklabels(), rotation=90)
here the modules imported:
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import datetime
import pandas as pd
import matplotlib.dates as dates
and here the result:
I do not understand why the x labels start with 'Feb'.
I would like to have something like 'Jan,Feb,Mar...' as x labels in the x axis.
Thanks in advance
The heights of the bar charts you made do not correspond to the labelled month, i.e. the values for Feb are actually those of Jan. Therefore, the problem is in the way you labelled the axis rather than having an incorrect plot order.
I'm not so familiar with the packages you used, so I propose a different way of making your plot:
# If the file was read with `index_col=0`, 'dates' is the index rather than a
# column and `dfr['dates']` would raise KeyError. Turn the index back into a
# column named 'dates' so the code below works for both ways of reading the file.
if 'dates' not in dfr.columns:
    dfr = dfr.rename_axis('dates').reset_index()
dfr['dates'] = pd.to_datetime(dfr['dates'])

### Group by calendar month. In your case each month has exactly one entry,
### but in case there are more, sum over all of them.
### groupby() returns the months in ascending order.
month_totals = dfr.groupby(dfr['dates'].dt.month)[['B', 'C']].sum()
months = [int(m) for m in month_totals.index]

fig, axs = plt.subplots()

### Half the distance between the 'B' bar and the 'C' bar of the same month.
spacing = 0.15
axs.bar([m - spacing for m in months], month_totals['B'], width=0.3)
axs.bar([m + spacing for m in months], month_totals['C'], width=0.3)

### One tick per month, centred between the two bars.
axs.set_xticks(months)
### 1900 and 1 are dummy values; we are just initializing a date instance here
### to obtain the abbreviated month name.
axs.set_xticklabels([datetime.date(1900, m, 1).strftime('%b') for m in months])
Output:

Unexplained "drops" in Savgol smoothing with higher polynomial for trends, stock, energy data (all kinds of time series basically!)

I have been trying to smooth curves with a Savitzky-Golay filter (SciPy's `savgol_filter`) and, in several of my attempts, raising the polynomial degree resulted in "drops" like the one I show below. This example is from Google Trends data, but I had similar problems with stock data and electricity consumption data. Any lead as to why it behaves like this, or how to solve it (and be able to raise the polynomial degree), would be highly appreciated.
Image below: "Sample output".
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)
from scipy.signal import savgol_filter
# Query Google Trends through pytrends for a single keyword over a fixed period.
kw_list = ["Carbon footprint"]
pytrends.build_payload(kw_list, timeframe='2004-12-14 2019-12-25', geo='', gprop='')
da1 = pytrends.interest_over_time()
# (The Savgol window must be odd, so the window length below is reduced by one
# when the series has an even number of records; there used to be 196 records.)
Y3 = da1["Carbon footprint"]
fig = plt.figure(figsize=(18,9))
# l becomes the filter window length: the number of samples, minus one if even.
l = Y3.shape[0]
l = l if l%2 == 1 else l-1
# window = odd number closest to size of data
# NOTE(review): with a window spanning (almost) the whole series, the filter is
# close to a single global polynomial fit. High polynomial orders over that many
# points are presumably numerically ill-conditioned, which may be the cause of
# the "drop" seen in the second panel -- confirm.
# Top panel: raw series (navy) and the order-7 smoothing (red).
ax1 = plt.subplot(2,1,1)
ax1 = sns.lineplot(data=Y3, color="navy")
# Savgol with polynomial order = 7 is fine (but misses the initial plateau)
Y3_smooth = savgol_filter(Y3,l, 7)
ax1 = sns.lineplot(x=da1.index.to_pydatetime(),y=Y3_smooth, color="red")
plt.title(f"red = with Savgol, polynomial order = 7, window = {l}", fontsize=18)
# Bottom panel: raw series (navy) and the order-10 smoothing (red).
ax2 = plt.subplot(2,1,2)
ax2 = sns.lineplot(data=Y3, color="navy")
# Savgol with polynomial order = 9 or more has a weird drop
Y3_smooth = savgol_filter(Y3,l, 10)
ax2 = sns.lineplot(x=da1.index.to_pydatetime(),y=Y3_smooth, color="red")
plt.title(f"red = with Savgol, polynomial order = 10, window = {l}", fontsize=18)
Sample output
If anyone is interested, I found this workaround using a different way to smooth. It works well including in the beginning and end, and allows a fine tuning of the degree of smoothing.
# `scipy.ndimage.filters` is a deprecated alias namespace; the function lives
# directly in `scipy.ndimage`.
from scipy.ndimage import gaussian_filter1d


def smooth(y, sigma=2):
    """Return *y* smoothed with a one-dimensional Gaussian filter.

    Args:
        y: Array-like sequence of samples to smooth.
        sigma: Standard deviation of the Gaussian kernel, in samples.
            Larger values give a smoother curve.

    Returns:
        An array with the same shape as *y*.

    Note:
        SciPy keeps the input dtype by default, so integer input produces
        integer output; pass floats if fractional results are needed.
    """
    return gaussian_filter1d(y, sigma)

Seaborn how to add number of samples per category in sns.catplot

I have a catplot drawing using:
s = sns.catplot(x="level", y="value", hue="cond", kind=graph_type, data=df)
However, the size of the groups is not equal:
"Minimal" has n=12 samples , and "Moderate" has n=18 samples.
How can I add this info to the graph?
Manually calculate the sizes and add them to xticklabels, something like this
import matplotlib.pyplot as plt
import seaborn as sns
exercise = sns.load_dataset("exercise")

# Number of samples in each "time" category, in value_counts() order.
sizes = dict(exercise['time'].value_counts())
levels = list(sizes)

# Tick text: the category name with its sample count in parentheses underneath.
tick_text = [level + '\n(' + str(count) + ')' for level, count in sizes.items()]

# Draw the categories in the same order as the labels built above.
g = sns.catplot(data=exercise, x="time", y="pulse", hue="kind",
                order=levels, kind="box")
g.set_axis_labels("", "pulse")
g.set_xticklabels(tick_text)
plt.show()

Time series plot of categorical or binary variables in pandas or matplotlib

I have data that represent a time series of categorical variables. I want to display the transitions in categories below a traditional line plot of related continuous time series to show off context as time evolves. I'd like to know the best way to do this. My attempt was in terms of Rectangles. The appearance is a bit weird, and importantly the axis labels for the x axis don't render as dates.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from pandas.plotting import register_matplotlib_converters
import matplotlib.dates as mdates
register_matplotlib_converters()
# Two categorical series, each with its own change timestamps.
t0 = pd.DatetimeIndex(["2017-06-01 00:00","2017-06-17 00:00","2017-07-03 00:00","2017-08-02 00:00","2017-08-09 00:00","2017-09-01 00:00"])
t1 = pd.DatetimeIndex(["2017-06-01 00:00","2017-08-15 00:00","2017-09-01 00:00"])
df0 = pd.DataFrame({"cat":[0,2,1,2,0,1]},index = t0)
df1 = pd.DataFrame({"op":[0,1,0]},index=t1)
# Create new plot
fig,ax = plt.subplots(1,figsize=(8,3))
# Per variable: category value -> (patch colour, legend label).
data_layout = {
"cat" : {0: ('bisque','Low'),
1: ('lightseagreen','Medium'),
2: ('rebeccapurple','High')},
"op" : {0: ('darkturquoise','Open'),
1: ('tomato','Close')}
}
# NOTE: `vars` shadows the Python builtin of the same name.
vars =("cat","op")
dfs = [df0,df1]
all_ticks = []
leg = []
# NOTE(review): the indentation of the loop bodies appears to have been lost in
# this listing; presumably everything down to `leg.append(...)` belongs to this
# outer loop -- confirm.
# One horizontal band per variable: band j covers y in [j, j + 1].
for j,(v,d) in enumerate(zip(vars,dfs)):
dvals = d[v][:].astype("d")
normal = mpl.colors.Normalize(vmin=0, vmax=2.)
# `colors` is computed here but not used later in this snippet.
colors = plt.cm.Set1(0.75*normal(dvals.as_matrix()))
handles = []
# One rectangle per interval between consecutive timestamps.
# NOTE(review): d.count() is called on a DataFrame here; presumably the number
# of rows was intended -- confirm.
for i in range(d.count()-1):
s = d[v].index.to_pydatetime()
level = d[v][i]
base = d[v].index[i]
# Rectangle width: time until the next category change.
w = s[i+1] - s[i]
patch=mpl.patches.Rectangle((base,float(j)),width=w,color=data_layout[v][level][0],height=1,fill=True)
ax.add_patch(patch)
# One legend entry per category of this variable.
for lev in data_layout[v]:
# Python 2 print statement; note that it prints the entry for `level`, not `lev`.
print data_layout[v][level]
handles.append(mpl.patches.Patch(color=data_layout[v][lev][0],label=data_layout[v][lev][1]))
# Put the y tick in the middle of band j.
all_ticks.append(j+0.5)
# Legend location code: 4 for j == 0 and 1 for j == 1.
leg.append( plt.legend(handles=handles,loc = (3-3*j+1)))
# Separator line between the two bands.
plt.axhline(y=1.,linewidth=3,color="gray")
plt.xlim(pd.Timestamp(2017,6,1).to_pydatetime(),pd.Timestamp(2017,9,1).to_pydatetime())
plt.ylim(0,2)
ax.add_artist(leg[0]) # two legends on one axis
# NOTE(review): format_xdata only controls the cursor-position readout, not the
# tick labels; tick labels are set through ax.xaxis.set_major_formatter(...).
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d') # This fails
plt.yticks(all_ticks,vars)
plt.show()
which produces this, with no dates on the x axis and jittery lines. How do I fix this? Is there a better way entirely?
This is a way to display dates on x-axis:
In your code substitute the line that fails with this one:
ax.xaxis.set_major_formatter((mdates.DateFormatter('%Y-%m-%d')))
But I don't remember what it should look like; can you show us the end result again?