Is there any other way to find percentage and plot a group bar-chart without using matplotlib? - pandas

emp_attrited = pd.DataFrame(df[df['Attrition'] == 'Yes'])
emp_not_attrited = pd.DataFrame(df[df['Attrition'] == 'No'])
print(emp_attrited.shape)
print(emp_not_attrited.shape)
att_dep = emp_attrited['Department'].value_counts()
percentage_att_dep = (att_dep/237)*100
print("Attrited")
print(percentage_att_dep)
not_att_dep = emp_not_attrited['Department'].value_counts()
percentage_not_att_dep = (not_att_dep/1233)*100
print("\nNot Attrited")
print(percentage_not_att_dep)
fig = plt.figure(figsize=(20,10))
ax1 = fig.add_subplot(221)
index = np.arange(att_dep.count())
bar_width = 0.15
rect1 = ax1.bar(index, percentage_att_dep, bar_width, color = 'black', label = 'Attrited')
rect2 = ax1.bar(index + bar_width, percentage_not_att_dep, bar_width, color = 'green', label = 'Not Attrited')
ax1.set_ylabel('Percenatage')
ax1.set_title('Comparison')
xTickMarks = att_dep.index.values.tolist()
ax1.set_xticks(index + bar_width)
xTickNames = ax1.set_xticklabels(xTickMarks)
plt.legend()
plt.tight_layout()
plt.show()
The first block represents how the dataset is split into 2 based upon Attrition
The second block represents the calculation of percentage of Employees in each Department who are attrited and not attrited.
The third block is to plot the given as a grouped chart.

You can do:
(df.groupby(['Department'])
['Attrited'].value_counts(normalize=True)
.unstack('Attrited')
.plot.bar()
)

Related

How to expand bars over the month on the x-axis while being the same width?

for i in range(len(basin)):
prefix = "URL here"
state = "OR"
basin_name = basin[i]
df_orig = pd.read_csv(f"{prefix}/{basin_name}.csv", index_col=0)
#---create date x-index
curr_wy_date_rng = pd.date_range(
start=dt(curr_wy-1, 10, 1),
end=dt(curr_wy, 9, 30),
freq="D",
)
if not calendar.isleap(curr_wy):
print("dropping leap day")
df_orig.drop(["02-29"], inplace=True)
use_cols = ["Median ('91-'20)", f"{curr_wy}"]
df = pd.DataFrame(data=df_orig[use_cols].copy())
df.index = curr_wy_date_rng
#--create EOM percent of median values-------------------------------------
curr_wy_month_rng = pd.date_range(
start=dt(curr_wy-1, 10, 1),
end=dt(curr_wy, 6, 30),
freq="M",
)
df_monthly_prec = pd.DataFrame(data=df_monthly_basin[basin[i]].copy())
df_monthly_prec.index = curr_wy_month_rng
df_monthly = df.groupby(pd.Grouper(freq="M")).max()
df_monthly["date"] = df_monthly.index
df_monthly["wy_date"] = df_monthly["date"].apply(lambda x: cal_to_wy(x))
df_monthly.index = pd.to_datetime(df_monthly["wy_date"])
df_monthly.index = df_monthly["date"]
df_monthly["month"] = df_monthly["date"].apply(
lambda x: calendar.month_abbr[x.month]
)
df_monthly["wy"] = df_monthly["wy_date"].apply(lambda x: x.year)
df_monthly.sort_values(by="wy_date", axis=0, inplace=True)
df_monthly.drop(
columns=[i for i in df_monthly.columns if "date" in i], inplace=True
)
# df_monthly.index = df_monthly['month']
df_merge = pd.merge(df_monthly,df_monthly_prec,how='inner', left_index=True, right_index=True)
#---Subplots---------------------------------------------------------------
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(df_merge.index, df_merge["Median ('91-'20)"], color="green", linewidth="1", linestyle="dashed", label = 'Median Snowpack')
ax.plot(df_merge.index, df_merge[f'{curr_wy}'], color='red', linewidth='2',label='WY Current')
#------Seting x-axis range to expand bar width for ax2
ax.bar(df_merge.index,df_merge[basin[i]], color = 'blue', label = 'Monthly %')
#n = n + 1
#--format chart
ax.set_title(chart_name[w], fontweight = 'bold')
w = w + 1
ax.set_ylabel("Basin Precipitation Index")
ax.set_yticklabels([])
ax.margins(x=0)
ax.legend()
#plt.xlim(0,9)
#---Setting date format
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
#---EXPORT
plt.show()
End result desired: Plotting both the monthly dataframe (df_monthly_prec) with the daily dataframe charting only monthly values (df_monthly). The bars for the monthly DataFrame should ideally span the whole month on the chart.
I have tried creating a secondary axis, but had trouble aligning the times for the primary and secondary axes. Ideally, I would like to replace plotting df_monthly with df (showing all daily data instead of just the end-of-month values within the daily dataset).
Any assistance or pointers would be much appreciated! Apologies if additional clarification is needed.

What is Julia's equivalent ggplot code of R's?

I would like to plot a sophisticated graph in Julia. The code below is in Julia's version using ggplot.
using CairoMakie, DataFrames, Effects, GLM, StatsModels, StableRNGs, RCall
#rlibrary ggplot2
rng = StableRNG(42)
growthdata = DataFrame(; age=[13:20; 13:20],
sex=repeat(["male", "female"], inner=8),
weight=[range(100, 155; length=8); range(100, 125; length=8)] .+ randn(rng, 16))
mod_uncentered = lm(#formula(weight ~ 1 + sex * age), growthdata)
refgrid = copy(growthdata)
filter!(refgrid) do row
return mod(row.age, 2) == (row.sex == "male")
end
effects!(refgrid, mod_uncentered)
refgrid[!, :lower] = #. refgrid.weight - 1.96 * refgrid.err
refgrid[!, :upper] = #. refgrid.weight + 1.96 * refgrid.err
df= refgrid
ggplot(df, aes(x=:age, y=:weight, group = :sex, shape= :sex, linetype=:sex)) +
geom_point(position=position_dodge(width=0.15)) +
geom_ribbon(aes(ymin=:lower, ymax=:upper), fill="gray", alpha=0.5)+
geom_line(position=position_dodge(width=0.15)) +
ylab("Weight")+ xlab("Age")+
theme_classic()
However, I would like to modify this graph a bit more. For example, I would like to change the scale of the y axis, the colors of the ribbon, add some error bars, and also change the text size of the legend and so on. Since I am new to Julia, I am not succeding in finding the equivalent language code for these modifications. Could someone help me translate this R code below of ggplot into Julia's language?
t1= filter(df, sex=="male") %>% slice_max(df$weight)
ggplot(df, aes(age, weight, group = sex, shape= sex, linetype=sex,fill=sex, colour=sex)) +
geom_line(position=position_dodge(width=0.15)) +
geom_point(position=position_dodge(width=0.15)) +
geom_errorbar(aes(ymin = lower, ymax = upper),width = 0.1,
linetype = "solid",position=position_dodge(width=0.15))+
geom_ribbon(aes(ymin = lower, ymax = upper, fill = sex, colour = sex), alpha = 0.2) +
geom_text(data = t1, aes(age, weight, label = round(weight, 1)), hjust = -0.25, size=7,show_guide = FALSE) +
scale_y_continuous(limits = c(70, 150), breaks = seq(80, 140, by = 20))+
theme_classic()+
scale_colour_manual(values = c("orange", "blue")) +
guides(color = guide_legend(override.aes = list(linetype = c('dotted', 'dashed'))),
linetype = "none")+
xlab("Age")+ ylab("Average marginal effects") + ggtitle("Title") +
theme(
axis.title.y = element_text(color="Black", size=28, face="bold", hjust = 0.9),
axis.text.y = element_text(face="bold", color="black", size=16),
plot.title = element_text(hjust = 0.5, color="Black", size=28, face="bold"),
legend.title = element_text(color = "Black", size = 13),
legend.text = element_text(color = "Black", size = 16),
legend.position="bottom",
axis.text.x = element_text(face="bold", color="black", size=11),
strip.text = element_text(face= "bold", size=15)
)
As I commented before, you can use R-strings to run R code. To be clear, this isn't like your post's approach where you piece together many Julia objects that wrap many R objects, this is RCall converting a Julia Dataframe to an R dataframe then running your R code.
Running an R script may not seem very Julian, but code reuse is very Julian. Besides, you're still using an R library and active R session either way, and there might even be a slight performance benefit from reducing how often you make wrapper objects and switch between Julia and R.
## import libraries for Julia and R; still good to do at top
using CairoMakie, DataFrames, Effects, GLM, StatsModels, StableRNGs, RCall
R"""
library(ggplot2)
library(dplyr)
"""
## your Julia code without the #rlibrary or ggplot lines
rng = StableRNG(42)
growthdata = DataFrame(; age=[13:20; 13:20],
sex=repeat(["male", "female"], inner=8),
weight=[range(100, 155; length=8); range(100, 125; length=8)] .+ randn(rng, 16))
mod_uncentered = lm(#formula(weight ~ 1 + sex * age), growthdata)
refgrid = copy(growthdata)
filter!(refgrid) do row
return mod(row.age, 2) == (row.sex == "male")
end
effects!(refgrid, mod_uncentered)
refgrid[!, :lower] = #. refgrid.weight - 1.96 * refgrid.err
refgrid[!, :upper] = #. refgrid.weight + 1.96 * refgrid.err
df= refgrid
## convert Julia's df and run your R code in R-string
## - note that $df is interpolation of Julia's df into R-string,
## not R's $ operator like in rdf$weight
## - call the R dataframe rdf because df is already an R function
R"""
rdf <- $df
t1= filter(rdf, sex=="male") %>% slice_max(rdf$weight)
ggplot(rdf, aes(age, weight, group = sex, shape= sex, linetype=sex,fill=sex, colour=sex)) +
geom_line(position=position_dodge(width=0.15)) +
geom_point(position=position_dodge(width=0.15)) +
geom_errorbar(aes(ymin = lower, ymax = upper),width = 0.1,
linetype = "solid",position=position_dodge(width=0.15))+
geom_ribbon(aes(ymin = lower, ymax = upper, fill = sex, colour = sex), alpha = 0.2) +
geom_text(data = t1, aes(age, weight, label = round(weight, 1)), hjust = -0.25, size=7,show_guide = FALSE) +
scale_y_continuous(limits = c(70, 150), breaks = seq(80, 140, by = 20))+
theme_classic()+
scale_colour_manual(values = c("orange", "blue")) +
guides(color = guide_legend(override.aes = list(linetype = c('dotted', 'dashed'))),
linetype = "none")+
xlab("Age")+ ylab("Average marginal effects") + ggtitle("Title") +
theme(
axis.title.y = element_text(color="Black", size=28, face="bold", hjust = 0.9),
axis.text.y = element_text(face="bold", color="black", size=16),
plot.title = element_text(hjust = 0.5, color="Black", size=28, face="bold"),
legend.title = element_text(color = "Black", size = 13),
legend.text = element_text(color = "Black", size = 16),
legend.position="bottom",
axis.text.x = element_text(face="bold", color="black", size=11),
strip.text = element_text(face= "bold", size=15)
)
"""
The result is the same as your post's R code:
I used Vega-Lite (https://github.com/queryverse/VegaLite.jl) which is also grounded in the "Grammar of Graphics", and LinearRegression (https://github.com/ericqu/LinearRegression.jl) which provides similar features as GLM, although I think it is possible to get comparable results with the other plotting and linear regression packages. Nevertheless, I hope that this gives you a starting point.
using LinearRegression: Distributions, DataFrames, CategoricalArrays
using DataFrames, StatsModels, LinearRegression
using VegaLite
growthdata = DataFrame(; age=[13:20; 13:20],
sex=categorical(repeat(["male", "female"], inner=8), compress=true),
weight=[range(100, 155; length=8); range(100, 125; length=8)] .+ randn(16))
lm = regress(#formula(weight ~ 1 + sex * age), growthdata)
results = predict_in_sample(lm, growthdata, req_stats="all")
fp = select(results, [:age, :weight, :sex, :uclp, :lclp, :predicted]) |> #vlplot() +
#vlplot(
mark = :errorband, color = :sex,
y = { field = :uclp, type = :quantitative, title="Average marginal effects"},
y2 = { field = :lclp, type = :quantitative },
x = {:age, type = :quantitative} ) +
#vlplot(
mark = :line, color = :sex,
x = {:age, type = :quantitative},
y = {:predicted, type = :quantitative}) +
#vlplot(
:point, color=:sex ,
x = {:age, type = :quantitative, axis = {grid = false}, scale = {zero = false}},
y = {:weight, type = :quantitative, axis = {grid = false}, scale = {zero = false}},
title = "Title", width = 400 , height = 400
)
which gives:
You can change the style of the elements by changing the "config" as indicated here (https://www.queryverse.org/VegaLite.jl/stable/gettingstarted/tutorial/#Config-1).
As the Julia Vega-Lite is a wrapper to Vega-Lite additional documentation can be found on the Vega-lite website (https://vega.github.io/vega-lite/)

matplotlib - Plot 3 histograms with percentage on y axis

I am trying to have 3 histograms on same plot but against percentage on the y axis but not sure how.
You can see what I am trying to do, but I can't get there! The bars don't scale property and not sure how to do that! Please help
data_1 = [1,1,2,3,4,4,5,6,7,7,7,8,9]
data_2 = [4,2,3,4,1,1,6,8,9,9,9,5,6]
data_3 = [1,2,3,4,5,6,7,8,9,9,4,3,7,1,2,3,2,5,7,3,4,7,3,8,2,3,4,7,2]
bin = [1,10,1]
bin[1] += bin[2]*2
bin[0] -= bin[2]
bins_list = np.arange(bin[0],bin[1],bin[2])
fig, ax = plt.subplots(figsize=(15, 10))
counts, bins, patches = plt.hist((np.clip(data_1 , bins_list[0], bins_list[-1]),np.clip(data_2, bins_list[0], bins_list[-1]),np.clip(data_3, bins_list[0], bins_list[-1])),
rwidth=0.8,
bins=bins_list, color=('blue', 'red', 'green'),
weights=None)
xlabels = bins[1:].astype(str)
xlabels[-1] = xlabels[-2] + '>'
xlabels[0] = xlabels[0] + '<'
for lb in range(1, len(xlabels)-1):
xlabels[lb] = str(bins_list[lb])+'-'+xlabels[lb]
N_labels = len(xlabels)
plt.xticks(bin[2] * np.arange(N_labels)+bin[2]/2 + bin[0], rotation=300)
ax.set_xticklabels(xlabels)
total = 0
''' Add percentages and value for each bar '''
for c in range(len(patches)):
for count, p in zip(counts[c], patches[c]):
percentage = '%0.2f%% ' % (100 * float(count) / counts[c].sum())
total += 100 * float(count) / counts[c].sum()
x1 = p.get_x()
y1 = p.get_y() + p.get_height()
ax.annotate(percentage, (x1, y1), rotation=270, fontsize = 10)
ax.yaxis.set_major_formatter(tkr.PercentFormatter(xmax=len(data_3)))
ax.grid(axis='y', color='r', linestyle=':')
ax.set_title("Please help")
ax.set_axisbelow(True)
fig.tight_layout()
Plot Result of code above
You can pass density=True on plt.hist:
counts, bins, patches = plt.hist((np.clip(data_1 , bins_list[0], bins_list[-1]),
np.clip(data_2, bins_list[0], bins_list[-1]),
np.clip(data_3, bins_list[0], bins_list[-1])),
rwidth=0.8,
density=True, # here
bins=bins_list, color=('blue', 'red', 'green'),
weights=None)
Output:

How can the edge colors of individual matplotlib histograms be set?

I've got a rough and ready function that can be used to compare two sets of values using histograms:
I want to set the individual edge colors of each of the histograms in the top plot (much as how I set the individual sets of values used for each histogram). How could this be done?
import os
import datavision
import matplotlib.pyplot
import numpy
import shijian
def main():
a = numpy.random.normal(2, 2, size = 120)
b = numpy.random.normal(2, 2, size = 120)
save_histogram_comparison_matplotlib(
values_1 = a,
values_2 = b,
label_1 = "a",
label_2 = "b",
normalize = True,
label_ratio_x = "measurement",
label_y = "",
title = "comparison of a and b",
filename = "histogram_comparison_1.png"
)
def save_histogram_comparison_matplotlib(
values_1 = None,
values_2 = None,
filename = None,
directory = ".",
number_of_bins = None,
normalize = True,
label_x = "",
label_y = None,
label_ratio_x = None,
label_ratio_y = "ratio",
title = "comparison",
label_1 = "1",
label_2 = "2",
overwrite = True,
LaTeX = False,
#aspect = None,
font_size = 20,
color_1 = "#3861AA",
color_2 = "#00FF00",
color_3 = "#7FDADC",
color_edge_1 = "#3861AA", # |<---------- insert magic for these
color_edge_2 = "#00FF00", # |
alpha = 0.5,
width_line = 1
):
matplotlib.pyplot.ioff()
if LaTeX is True:
matplotlib.pyplot.rc("text", usetex = True)
matplotlib.pyplot.rc("font", family = "serif")
if number_of_bins is None:
number_of_bins_1 = datavision.propose_number_of_bins(values_1)
number_of_bins_2 = datavision.propose_number_of_bins(values_2)
number_of_bins = int((number_of_bins_1 + number_of_bins_2) / 2)
if filename is None:
if title is None:
filename = "histogram_comparison.png"
else:
filename = shijian.propose_filename(
filename = title + ".png",
overwrite = overwrite
)
else:
filename = shijian.propose_filename(
filename = filename,
overwrite = overwrite
)
values = []
values.append(values_1)
values.append(values_2)
bar_width = 0.8
figure, (axis_1, axis_2) = matplotlib.pyplot.subplots(
nrows = 2,
gridspec_kw = {"height_ratios": (2, 1)}
)
ns, bins, patches = axis_1.hist(
values,
color = [
color_1,
color_2
],
normed = normalize,
histtype = "stepfilled",
bins = number_of_bins,
alpha = alpha,
label = [label_1, label_2],
rwidth = bar_width,
linewidth = width_line,
#edgecolor = [color_edge_1, color_edge_2] <---------- magic here? dunno
)
axis_1.legend(
loc = "best"
)
bars = axis_2.bar(
bins[:-1],
ns[0] / ns[1],
alpha = 1,
linewidth = 0, #width_line
width = bins[1] - bins[0]
)
for bar in bars:
bar.set_color(color_3)
axis_1.set_xlabel(label_x, fontsize = font_size)
axis_1.set_ylabel(label_y, fontsize = font_size)
axis_2.set_xlabel(label_ratio_x, fontsize = font_size)
axis_2.set_ylabel(label_ratio_y, fontsize = font_size)
#axis_1.xticks(fontsize = font_size)
#axis_1.yticks(fontsize = font_size)
#axis_2.xticks(fontsize = font_size)
#axis_2.yticks(fontsize = font_size)
matplotlib.pyplot.suptitle(title, fontsize = font_size)
if not os.path.exists(directory):
os.makedirs(directory)
#if aspect is None:
# matplotlib.pyplot.axes().set_aspect(
# 1 / matplotlib.pyplot.axes().get_data_ratio()
# )
#else:
# matplotlib.pyplot.axes().set_aspect(aspect)
figure.tight_layout()
matplotlib.pyplot.subplots_adjust(top = 0.9)
matplotlib.pyplot.savefig(
directory + "/" + filename,
dpi = 700
)
matplotlib.pyplot.close()
if __name__ == "__main__":
main()
You may simply plot two different histograms but share the bins.
import numpy as np; np.random.seed(3)
import matplotlib.pyplot as plt
a = np.random.normal(size=(89,2))
kws = dict(histtype= "stepfilled",alpha= 0.5, linewidth = 2)
hist, edges,_ = plt.hist(a[:,0], bins = 6,color="lightseagreen", label = "A", edgecolor="k", **kws)
plt.hist(a[:,1], bins = edges,color="gold", label = "B", edgecolor="crimson", **kws)
plt.show()
Use the lists of Patches objects returned by the hist() function.
In your case, you have two datasets, so your variable patches will be a list containing two lists, each with the Patches objects used to draw the bars on your plot.
You can easily set the properties on all of these objects using the setp() function. For example:
a = np.random.normal(size=(100,))
b = np.random.normal(size=(100,))
c,d,e = plt.hist([a,b], color=['r','g'])
plt.setp(e[0], edgecolor='k', lw=2)
plt.setp(e[1], edgecolor='b', lw=3)

ggplot2: How to move y axis labels right next to the bars

I am working with following reproducible dataset:
df<- data.frame(name=c(letters[1:10],letters[1:10]),fc=runif(20,-5,5)
,fdr=runif(20),group=c(rep("gene",10),rep("protein",10)))
Code used to plot:
df$sig<- ifelse(df$fdr<0.05 & df$fdr>0 ,"*","")
ggplot(df, aes(x=reorder(name,fc),fc))+geom_col(aes(fill=group),position = "dodge",width = 0.9)+
coord_flip()+
geom_text(aes(label = sig),angle = 90, position = position_stack(vjust = -0.2), color= "black",size=3)+
scale_y_continuous(position = "right")+
scale_fill_manual(values = c("gene"= "#FF002B","protein"="blue"))+
geom_hline(yintercept = 0, colour = "gray" )+
theme(legend.position="none", axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.text.y=element_text(),
axis.line=element_line(color="gray"),axis.line.y=element_blank(),
axis.ticks.y=element_blank(),
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
Resulting in following plot:
Instead of having the y-axis labels on left side, I would like to place them right next to the bars. I want to emulate this chart published in nature:
https://www.nature.com/articles/ncomms2112/figures/3
Like this?
df<- data.frame(name=c(letters[1:10],letters[1:10]),fc=runif(20,-5,5)
,fdr=runif(20),group=c(rep("gene",10),rep("protein",10)))
df$sig<- ifelse(df$fdr<0.05 & df$fdr>0 ,"*","")
df$try<-c(1:10,1:10) #assign numbers to letters
x_pos<-ifelse(df$group=='gene',df$try-.2,df$try+.2) #align letters over bars
y_posneg<-ifelse(df$fc>0,df$fc+.5,df$fc-.5) #set up y axis position of letters
ggplot(df, aes(x=try,fc))+geom_col(aes(fill=group),position = "dodge",width = 0.9)+
coord_flip()+
geom_text(aes(y=y_posneg,x=x_pos,label = name),color= "black",size=6)+
scale_y_continuous(position = "right")+
scale_fill_manual(values = c("gene"= "#FF002B","protein"="blue"))+
geom_hline(yintercept = 0, colour = "gray" )+
theme(legend.position="none", axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.text.y=element_blank(),
axis.line=element_line(color="gray"),axis.line.y=element_blank(),
axis.ticks.y=element_blank(),
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
Gives:
Or perhaps this?
x_pos<-ifelse(df$group=='gene',df$try-.2,df$try+.2) #align letters over bars
y_pos<-ifelse(df$fc>0,-.2,.2) #set up y axis position of letters
ggplot(df, aes(x=try,fc))+geom_col(aes(fill=group),position = "dodge",width = 0.9)+
coord_flip()+
geom_text(aes(y=y_pos,x=x_pos,label = name),color= "black",size=3)+
scale_y_continuous(position = "right")+
scale_fill_manual(values = c("gene"= "#FF002B","protein"="blue"))+
geom_hline(yintercept = 0, colour = "gray" )+
theme(legend.position="none", axis.title.y=element_blank(),
axis.title.x=element_blank(),
axis.text.y=element_blank(),
axis.line=element_line(color="gray"),axis.line.y=element_blank(),
axis.ticks.y=element_blank(),
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank())
Gives: