Stacked bar plot - percentage - pandas

I want to represent this information in a stacked bar plot showing percentages: the age groups on the x axis, and on the y axis the percentage of each Gender within each age group.
Age is represented by bins in the dataset.
This is my code so far:
c = ds.groupby(['Age', 'Gender'])['Gender'].count()
d = ((c / c.groupby(level=0).sum()) * 100).round().astype('int64')
d

I created a test data frame:
df = pd.DataFrame({'Gender': ['F','M','F','F','F','M','M','M','F','F','M','F','F','M','M','M','M','F','F','M','M','M'], 'Age': [17,10,20,51,53,15,50,60,43,28,35,67,33,17,20,40,43,47,48,51,53,54]})
You can use the pandas.cut function to segment the ages into intervals:
import numpy as np
import pandas as pd

bins = pd.IntervalIndex.from_tuples([(0, 17), (17, 25), (25, 35), (35, 45), (45, 50), (50, 55), (55, np.inf)])
df['Age_interval'] = pd.cut(df['Age'], bins=bins)
df = df.groupby(['Age_interval', 'Gender']).size().unstack().fillna(0)
total = df['F'] + df['M']        # counts per age group
df['F'] = df['F'] / total * 100  # percentage of women in each age group
df['M'] = df['M'] / total * 100  # percentage of men in each age group
df['Age'] = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
df.plot(kind='bar', x='Age', title='Gender distribution in Age groups', rot=0,
        figsize=(10, 5), color=['turquoise', 'brown'], stacked=True)
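As an aside, once the Age_interval column has been added with pd.cut, the per-group percentages can also be computed in a single call with pd.crosstab and normalize='index'. A minimal sketch of that variant (run it in place of the groupby/unstack lines above, while df still has its original columns):
import matplotlib.pyplot as plt

pct = pd.crosstab(df['Age_interval'], df['Gender'], normalize='index') * 100
pct.plot(kind='bar', stacked=True, rot=0, figsize=(10, 5),
         color=['turquoise', 'brown'], title='Gender distribution in Age groups')
plt.show()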


How to expand bars over the month on the x-axis while being the same width?

for i in range(len(basin)):
    prefix = "URL here"
    state = "OR"
    basin_name = basin[i]
    df_orig = pd.read_csv(f"{prefix}/{basin_name}.csv", index_col=0)

    # ---create date x-index
    curr_wy_date_rng = pd.date_range(
        start=dt(curr_wy - 1, 10, 1),
        end=dt(curr_wy, 9, 30),
        freq="D",
    )
    if not calendar.isleap(curr_wy):
        print("dropping leap day")
        df_orig.drop(["02-29"], inplace=True)
    use_cols = ["Median ('91-'20)", f"{curr_wy}"]
    df = pd.DataFrame(data=df_orig[use_cols].copy())
    df.index = curr_wy_date_rng

    # --create EOM percent of median values-------------------------------------
    curr_wy_month_rng = pd.date_range(
        start=dt(curr_wy - 1, 10, 1),
        end=dt(curr_wy, 6, 30),
        freq="M",
    )
    df_monthly_prec = pd.DataFrame(data=df_monthly_basin[basin[i]].copy())
    df_monthly_prec.index = curr_wy_month_rng

    df_monthly = df.groupby(pd.Grouper(freq="M")).max()
    df_monthly["date"] = df_monthly.index
    df_monthly["wy_date"] = df_monthly["date"].apply(lambda x: cal_to_wy(x))
    df_monthly.index = pd.to_datetime(df_monthly["wy_date"])
    df_monthly.index = df_monthly["date"]
    df_monthly["month"] = df_monthly["date"].apply(
        lambda x: calendar.month_abbr[x.month]
    )
    df_monthly["wy"] = df_monthly["wy_date"].apply(lambda x: x.year)
    df_monthly.sort_values(by="wy_date", axis=0, inplace=True)
    df_monthly.drop(
        columns=[c for c in df_monthly.columns if "date" in c], inplace=True
    )
    # df_monthly.index = df_monthly['month']
    df_merge = pd.merge(df_monthly, df_monthly_prec, how='inner', left_index=True, right_index=True)

    # ---Subplots---------------------------------------------------------------
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(df_merge.index, df_merge["Median ('91-'20)"], color="green", linewidth=1, linestyle="dashed", label='Median Snowpack')
    ax.plot(df_merge.index, df_merge[f'{curr_wy}'], color='red', linewidth=2, label='WY Current')
    # ------Setting x-axis range to expand bar width for ax2
    ax.bar(df_merge.index, df_merge[basin[i]], color='blue', label='Monthly %')
    # n = n + 1

    # --format chart
    ax.set_title(chart_name[w], fontweight='bold')
    w = w + 1
    ax.set_ylabel("Basin Precipitation Index")
    ax.set_yticklabels([])
    ax.margins(x=0)
    ax.legend()
    # plt.xlim(0,9)
    # ---Setting date format
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
    # ---EXPORT
    plt.show()
Desired end result: plot the monthly DataFrame (df_monthly_prec) as bars alongside the monthly values derived from the daily DataFrame (df_monthly), with each bar ideally spanning its whole month on the chart.
I have tried creating a secondary axis, but had trouble aligning the times on the primary and secondary axes. Ideally, I would also like to plot df (all of the daily data) instead of df_monthly (only the end-of-month values within the daily dataset).
Any assistance or pointers would be much appreciated! Apologies if additional clarification is needed.
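One way to get bars that span their whole month on a date axis is to anchor each bar at the first day of its month with align='edge' and give it a width equal to the number of days in that month. A minimal, self-contained sketch (the monthly frame and its pct column below are hypothetical stand-ins for df_monthly_prec, not the asker's data):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# hypothetical monthly values, indexed by month-end timestamps like df_monthly_prec
rng = pd.date_range("2021-10-31", "2022-06-30", freq="M")
monthly = pd.DataFrame({"pct": np.linspace(40, 120, len(rng))}, index=rng)

fig, ax = plt.subplots(figsize=(8, 4))
# anchor each bar at the first day of its month and make it as wide as that month,
# so adjacent bars tile the axis with no gaps
month_starts = monthly.index.to_period("M").to_timestamp()
widths = monthly.index.day  # days in each month == bar width in date units
ax.bar(month_starts, monthly["pct"], width=widths, align="edge",
       color="blue", alpha=0.4, label="Monthly %")

ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
ax.margins(x=0)
ax.legend()
plt.show()
The daily lines (df and the median column) can then be drawn on the same ax with ax.plot as in the code above, since both use real dates on the x axis.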

discrete numpy array to continuous array

I have some discrete data in an array, such that:
arr = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
which, when flattened and plotted, gives a stepped profile of constant blocks.
I also have an index array, such that each unique value in arr is associated with a unique index value, like:
ind = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
What is the most pythonic way of converting arr from discrete values to continuous values, i.e. interpolating between the discrete points so that the plotted result becomes a continuous, piecewise-linear curve?
I found a solution to this, in case anyone has a similar issue. It is maybe not the most elegant, so modifications are welcome:
import numpy as np

def ref_linear_interp(x, y):
    arr = []
    ux = np.unique(x)  # unique x values
    for u in ux:
        idx = y[x == u]
        # segment endpoints: previous block's value -> current value
        try:
            lo = y[x == u - 1][0]
            hi = y[x == u][0]
        except IndexError:
            lo = y[x == u][0]
            hi = y[x == u][0]
        # segment endpoints: current value -> next block's value (overrides the above)
        try:
            lo = y[x == u][0]
            hi = y[x == u + 1][0]
        except IndexError:
            lo = y[x == u][0]
            hi = y[x == u][0]
        if lo == hi:
            sub = np.full((len(idx)), lo)
            arr.append(sub)
        else:
            sub = np.linspace(lo, hi, len(idx))
            arr.append(sub)
    return np.concatenate(arr, axis=None).ravel()

y = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [2, 2, 2], [1, 1, 1]])
x = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4], [5, 5, 5]])
z = np.arange(1, 16, 1)
continuous = ref_linear_interp(x, y)  # z can serve as the x-axis when plotting
Here is an answer for the symmetric solution that I would expect when reading the question:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# create the data as described
numbers = [1, 2, 3, 2, 1]
nblock = 3
df = pd.DataFrame({
    "x": np.arange(nblock * len(numbers)),
    "y": np.repeat(numbers, nblock),
    "label": np.repeat(np.arange(len(numbers)), nblock)
})
Assuming a constant block size of 3, we could use a rolling window:
df['y-smooth'] = df['y'].rolling(nblock, center=True).mean()
# fill the NaNs at the edges
df['y-smooth'] = df['y-smooth'].bfill().ffill()
plt.plot(df['x'], df['y-smooth'], marker='*')
If the block size is allowed to vary, we could determine the block centers and interpolate piecewise.
centers = df[['x', 'y', 'label']].groupby('label').mean()
df['y-interp'] = np.interp(df['x'], centers['x'], centers['y'])
plt.plot(df['x'], df['y-interp'], marker='*')
Note: you may also try centers = df[['x', 'y', 'label']].groupby('label').min() to select the left corner of the labelled blocks.
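For reference, the same piecewise-interpolation idea can also be written directly against the question's numpy arrays, assuming (as in the example) that each unique value in ind labels exactly one constant block:
import numpy as np

arr = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [2, 2, 2], [1, 1, 1]]).ravel()
ind = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4], [5, 5, 5]]).ravel()

x = np.arange(arr.size)
labels = np.unique(ind)
# one (center position, value) pair per block, then piecewise-linear interpolation
centers = np.array([x[ind == u].mean() for u in labels])
values = np.array([arr[ind == u][0] for u in labels])
continuous = np.interp(x, centers, values)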

How to write a custom function to sort and pivot dataframe in python

In the dataframe below, I want to write a function that takes in a data frame and does the following:
select the Location, Group, and Income_Yr1 : Income_Yr3 columns
sort the dataframe by the Group column, from lowest to highest value
create pivot tables for the mean, median and standard deviation (making it 3 tables per Income_Yr, or possibly combining them into one) for Income_Yr1, Income_Yr2 and Income_Yr3
# DataFrame using arrays.
import pandas as pd
import numpy as np

# initialise data of lists.
data = {'Gender': ['F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'M'],
        'UID': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020],
        'Location': ['PHX', 'PHX', 'PHX', 'PHX', 'ATL', 'ATL', 'ATL', 'ATL', 'HOU', 'HOU', 'HOU', 'MIA', 'MIA', 'MIA', 'MIA', 'MIA', 'DEN', 'DEN', 'DEN', 'DEN'],
        'Group': [3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 5, 5, 5, 5],
        'Income_Yr1': [32112, 34214, 45575, 22106, 32612, 34216, 47515, 22906, 32112, 34511, 45525, 12106, 52112, 54214, 45015, 22986, 32112, 34214, 47518, 22175],
        'Income_Yr2': [52112, 54215, 65515, 72109, 52616, 64217, 77515, 52906, 52145, 38512, 65516, 32157, 63152, 57218, 51017, 42997, 38125, 36253, 49589, 32598],
        'Income_Yr3': [52143, 54239, 65557, 72116, 52660, 64273, 77551, 52969, 52500, 38201, 65169, 32795, 63288, 57180, 51173, 42970, 38205, 36301, 59591, 32580]}
df = pd.DataFrame(data)
See below for my attempt; I am open to other approaches.
# read in the dataset
def pivot_table(data):
    #1. import dataset and select the desired columns, I want to include all column names with string 'Income'
    df1 = df[['Group','Location','Income_Yr1':'Income_Yr3']]
    #2 sort the data using 'Group' column
    df1 = df1.sort_values('Group')
    #3a create pivot table for mean
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr1', columns='Location', margins=True)
    #3b create pivot table for median
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr1', columns='Location', aggfunc='median', margins=True)
    #3c create pivot table for std
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr1', columns='Location', aggfunc=np.std, margins=True)
    #3d Income_Yr2: create pivot table for mean
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr2', columns='Location', margins=True)
    #3e Income_Yr2: create pivot table for median
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr2', columns='Location', aggfunc='median', margins=True)
    #3f Income_Yr2: create pivot table for std
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr2', columns='Location', aggfunc=np.std, margins=True)
    #3g Income_Yr3: create pivot table for mean
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr3', columns='Location', margins=True)
    #3h Income_Yr3: create pivot table for median
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr3', columns='Location', aggfunc='median', margins=True)
    #3i Income_Yr3: create pivot table for std
    pd.pivot_table(df1, index=['Group','Location'], values='Income_Yr3', columns='Location', aggfunc=np.std, margins=True)
##########
#test code
pivot_table(df)
Thanks
Let's use melt, then groupby.
We can then use a dictionary comprehension to split your dataframes.
df1 = pd.melt(df,
              id_vars=['Group', 'Location'],
              value_vars=df.filter(like='Income').columns.tolist()
              ).sort_values('value')  # default is lowest to highest.
df2 = df1.groupby(['Group', 'Location', 'variable'])['value'].agg(['mean', 'median', 'std'])
# now for your split dataframes.
out = {income_yr: frame for income_yr, frame in df2.groupby(level=-1)}
print(out['Income_Yr2'])
                                   mean   median           std
Group Location variable
1     HOU      Income_Yr2  52057.666667  52145.0  13502.211831
      MIA      Income_Yr2  32157.000000  32157.0           NaN
2     MIA      Income_Yr2  53596.000000  54117.5   8629.910428
3     PHX      Income_Yr2  60987.750000  59865.0   9466.207878
4     ATL      Income_Yr2  61813.500000  58561.5  11779.239888
5     DEN      Income_Yr2  39141.250000  37189.0   7333.583770
Function
I'm not sure of the benefit of a function here unless you have very complicated data pipelines or need to re-use this piece of code in many places, but this should work:
import pandas as pd
from typing import Dict

def transform_and_split_data(data: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    df1 = pd.melt(data,
                  id_vars=['Group', 'Location'],
                  value_vars=data.filter(like='Income').columns.tolist()
                  ).sort_values('value')  # default is lowest to highest.
    df2 = df1.groupby(['Group', 'Location', 'variable'])['value'].agg(['mean', 'median', 'std'])
    return {income_yr: frame for income_yr, frame in df2.groupby(level=-1)}
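A quick usage sketch on the sample frame built above (the dictionary keys are the melted Income column names):
tables = transform_and_split_data(df)
print(list(tables))           # ['Income_Yr1', 'Income_Yr2', 'Income_Yr3']
print(tables['Income_Yr1'])   # mean / median / std per Group and Location for year 1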

Is there any other way to find percentage and plot a group bar-chart without using matplotlib?

emp_attrited = pd.DataFrame(df[df['Attrition'] == 'Yes'])
emp_not_attrited = pd.DataFrame(df[df['Attrition'] == 'No'])
print(emp_attrited.shape)
print(emp_not_attrited.shape)
att_dep = emp_attrited['Department'].value_counts()
percentage_att_dep = (att_dep/237)*100
print("Attrited")
print(percentage_att_dep)
not_att_dep = emp_not_attrited['Department'].value_counts()
percentage_not_att_dep = (not_att_dep/1233)*100
print("\nNot Attrited")
print(percentage_not_att_dep)
fig = plt.figure(figsize=(20,10))
ax1 = fig.add_subplot(221)
index = np.arange(att_dep.count())
bar_width = 0.15
rect1 = ax1.bar(index, percentage_att_dep, bar_width, color = 'black', label = 'Attrited')
rect2 = ax1.bar(index + bar_width, percentage_not_att_dep, bar_width, color = 'green', label = 'Not Attrited')
ax1.set_ylabel('Percentage')
ax1.set_title('Comparison')
xTickMarks = att_dep.index.values.tolist()
ax1.set_xticks(index + bar_width)
xTickNames = ax1.set_xticklabels(xTickMarks)
plt.legend()
plt.tight_layout()
plt.show()
The first block shows how the dataset is split into two based upon Attrition.
The second block calculates the percentage of employees in each Department who are attrited and not attrited.
The third block plots the result as a grouped bar chart.
You can do:
(df.groupby(['Department'])
   ['Attrition'].value_counts(normalize=True)
   .unstack('Attrition')
   .plot.bar()
)
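For a runnable illustration, here is the same approach on a tiny made-up frame (purely illustrative, not the real dataset), scaled by 100 so the y axis reads as a percentage:
import pandas as pd
import matplotlib.pyplot as plt

demo = pd.DataFrame({
    'Department': ['Sales', 'Sales', 'Sales', 'R&D', 'R&D', 'R&D', 'HR', 'HR'],
    'Attrition':  ['Yes',   'No',    'No',    'No',  'Yes', 'No',  'No', 'Yes'],
})

(demo.groupby('Department')['Attrition']
     .value_counts(normalize=True)   # fraction of Yes/No within each department
     .mul(100)                       # percentage instead of fraction
     .unstack('Attrition')
     .plot.bar(rot=0, title='Attrition percentage by Department'))
plt.show()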

Pandas: Using datetime as a condition

I'm using a for loop to plot all of the features in my dataset. I want it to skip plotting any attributes that have a datetime type, but it doesn't seem to be skipping them correctly. What do I need to fix?
(JFYI, I have confirmed with df.dtypes that the columns appear as datetime64[ns])
def plot_distribution(dataset, cols=5, width=20, height=50, hspace=0.2, wspace=0.5):
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        g = sns.countplot(y=column, hue=target_column, data=df)
        if df.dtypes[column] == np.datetime64:
            continue

plot_distribution(df, cols=1, width=20, height=500, hspace=0.8, wspace=0.5)
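For what it's worth, a sketch of one way to make the skip work: filter out datetime columns before any plotting happens (in the code above the if/continue only runs after countplot has already drawn the subplot), using pandas' datetime-aware dtype check. The target_column parameter below is an assumption standing in for the global variable used in the question:
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_distribution(dataset, target_column, cols=5, width=20, height=50, hspace=0.2, wspace=0.5):
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(wspace=wspace, hspace=hspace)
    # keep only non-datetime columns, so the subplot grid is sized correctly too
    plot_cols = [c for c in dataset.columns
                 if not pd.api.types.is_datetime64_any_dtype(dataset[c])]
    rows = math.ceil(len(plot_cols) / cols)
    for i, column in enumerate(plot_cols):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        sns.countplot(y=column, hue=target_column, data=dataset, ax=ax)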