# Draw one chart per basin: snowpack median and current-water-year lines,
# plus bars for the monthly precipitation values of the same basin.
# Relies on names defined elsewhere: basin, curr_wy, df_monthly_basin,
# chart_name, w, cal_to_wy, dt, pd, plt, mdates, calendar.

# Constants and date ranges do not depend on the basin, so build them once.
prefix = "URL here"
state = "OR"

#---create date x-index: one stamp per day from Oct 1 to Sep 30
curr_wy_date_rng = pd.date_range(
    start=dt(curr_wy - 1, 10, 1),
    end=dt(curr_wy, 9, 30),
    freq="D",
)
#---month-end stamps from October through June for the monthly data
# NOTE(review): recent pandas releases deprecate the "M" alias in favour of
# "ME" -- confirm against the installed pandas version before changing it.
curr_wy_month_rng = pd.date_range(
    start=dt(curr_wy - 1, 10, 1),
    end=dt(curr_wy, 6, 30),
    freq="M",
)

for basin_name in basin:
    df_orig = pd.read_csv(f"{prefix}/{basin_name}.csv", index_col=0)
    if not calendar.isleap(curr_wy):
        # The daily index built above has no Feb 29 in a non-leap year, so the
        # "02-29" row must go before the index is replaced.
        print("dropping leap day")
        df_orig.drop(["02-29"], inplace=True)
    use_cols = ["Median ('91-'20)", f"{curr_wy}"]
    df = pd.DataFrame(data=df_orig[use_cols].copy())
    df.index = curr_wy_date_rng

    #--create EOM percent of median values-------------------------------------
    df_monthly_prec = pd.DataFrame(data=df_monthly_basin[basin_name].copy())
    df_monthly_prec.index = curr_wy_month_rng

    # Per-month maximum of the daily columns, indexed by month end.
    df_monthly = df.groupby(pd.Grouper(freq="M")).max()
    df_monthly["date"] = df_monthly.index
    # cal_to_wy is defined elsewhere; presumably it maps a calendar date to a
    # water-year date -- TODO confirm.
    df_monthly["wy_date"] = df_monthly["date"].apply(cal_to_wy)
    df_monthly.index = df_monthly["date"]
    df_monthly["month"] = df_monthly["date"].apply(
        lambda stamp: calendar.month_abbr[stamp.month]
    )
    df_monthly["wy"] = df_monthly["wy_date"].apply(lambda stamp: stamp.year)
    df_monthly.sort_values(by="wy_date", axis=0, inplace=True)
    # The helper date columns were only needed for sorting and labelling.
    df_monthly.drop(
        columns=[col for col in df_monthly.columns if "date" in col],
        inplace=True,
    )
    # The inner join keeps only the month ends present in both frames.
    df_merge = pd.merge(
        df_monthly, df_monthly_prec, how="inner", left_index=True, right_index=True
    )

    #---Subplots---------------------------------------------------------------
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(
        df_merge.index,
        df_merge["Median ('91-'20)"],
        color="green",
        linewidth=1,
        linestyle="dashed",
        label="Median Snowpack",
    )
    ax.plot(
        df_merge.index,
        df_merge[f"{curr_wy}"],
        color="red",
        linewidth=2,
        label="WY Current",
    )
    ax.bar(df_merge.index, df_merge[basin_name], color="blue", label="Monthly %")

    #--format chart
    ax.set_title(chart_name[w], fontweight="bold")
    w += 1  # advance to the title of the next chart
    ax.set_ylabel("Basin Precipitation Index")
    ax.set_yticklabels([])
    ax.margins(x=0)
    ax.legend()

    #---Setting date format: one tick per month, labelled with the month abbreviation
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))

    #---EXPORT
    plt.show()
Desired end result: plot the monthly DataFrame (df_monthly_prec) together with the daily DataFrame, which is currently reduced to its monthly values only (df_monthly). Ideally, each bar for the monthly DataFrame should span its whole month on the chart.
I have tried creating a secondary axis, but had trouble aligning the times for the primary and secondary axes. Ideally, I would like to replace plotting df_monthly with df (showing all daily data instead of just the end-of-month values within the daily dataset).
Any assistance or pointers would be much appreciated! Apologies if additional clarification is needed.
I have some discrete data in an array, such that:
arr = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
whose plot looks like:
I also have an index array, such that each unique value in arr is associated with a unique index value, like:
ind = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
What is the most pythonic way of converting arr from discrete values to continuous values, so that the array would look like this when plotted?:
In other words, I want to interpolate between the discrete points to produce continuous data.
I found a solution to this if anyone has a similar issue. It is maybe not the most elegant so modifications are welcome:
def ref_linear_interp(x, y):
    """Turn block-constant values into linear ramps towards the next block.

    For every unique label ``u`` in ``x``, the elements of ``y`` carrying that
    label are replaced by evenly spaced values running from the block's first
    value to the first value of the block labelled ``u + 1``. A block with no
    successor (the last label, or a label followed by a gap) keeps its own
    first value throughout.

    Args:
        x: NumPy array of block labels, same shape as ``y``. The successor of
            label ``u`` is looked up as ``u + 1``.
        y: NumPy array of values to interpolate.

    Returns:
        A 1-D array with the interpolated values, blocks ordered by label.
        An empty float array is returned for empty input.
    """
    pieces = []
    for label in np.unique(x):
        block = y[x == label]
        start = block[0]
        following = y[x == label + 1]
        # No next block to ramp towards: keep this block flat.
        stop = following[0] if following.size else start
        if start == stop:
            pieces.append(np.full(len(block), start))
        else:
            pieces.append(np.linspace(start, stop, len(block)))
    if not pieces:
        # np.concatenate rejects an empty list, so answer empty input directly.
        return np.array([], dtype=float)
    return np.concatenate(pieces, axis=None).ravel()
# Example inputs: y holds the block values, x the matching block label of each element.
y = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
x = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
# Integers 1..15: one position per element of the flattened 5x3 arrays.
z = np.arange(1, 16, 1)
Here is an answer for the symmetric solution that I would expect when reading the question:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Sample data: every entry of `numbers` is repeated `nblock` times, giving
# constant blocks; `label` numbers the blocks and `x` is the running position.
numbers = [1, 2, 3, 2, 1]
nblock = 3
df = pd.DataFrame({"x": np.arange(nblock * len(numbers))}).assign(
    y=np.repeat(numbers, nblock),
    label=np.repeat(np.arange(len(numbers)), nblock),
)
Expecting a constant block size of 3, we could use a rolling window:
# Centered rolling mean over one block width; the window is incomplete at the
# two ends of the series, which leaves NaNs there.
df['y-smooth'] = df['y'].rolling(nblock, center=True).mean()
# fill NaNs
# Assign the filled column back instead of calling bfill/ffill with
# inplace=True on `df['y-smooth']`: an in-place call on a selected column is
# chained assignment and does not update `df` under pandas Copy-on-Write.
df['y-smooth'] = df['y-smooth'].bfill().ffill()
plt.plot(df['x'], df['y-smooth'], marker='*')
If the block size is allowed to vary, we could determine the block centers and interpolate piecewise.
# Collapse each labelled block to a single point: its mean x (the block centre) and mean y.
centers = df[['x', 'y', 'label']].groupby('label').mean()
# Piecewise-linear interpolation between those block points, evaluated at every x.
df['y-interp'] = np.interp(df['x'], centers['x'], centers['y'])
plt.plot(df['x'], df['y-interp'], marker='*')
Note: You may also try
centers = df[['x', 'y', 'label']].groupby('label').min() to select the left corner of the labelled blocks.
In the dataframe below
I want to write a function that takes in a DataFrame and does the following:
select Location , Group, Income_Yr1 : Income_Yr3 columns
sort the dataframe using the Group column from lowest to the highest value
create a pivot table for mean,median and standard deviation ( making it 3 tables per Income_Yr or possibly combine them as one ) for Income_Yr1, Income_Yr2 and Income_Yr3
# DataFrame using arrays.
import pandas as pd
import numpy as np
# Sample data: 20 records, one list per column (Gender, UID, Location, Group
# and three yearly income columns).
data = {'Gender':['F', 'F', 'M', 'F','M', 'F', 'M', 'M','F', 'F', 'M', 'F','M', 'F', 'M', 'M','M','F', 'F', 'M'],
'UID':[1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020],
'Location':['PHX','PHX','PHX','PHX','ATL','ATL','ATL','ATL','HOU','HOU','HOU','MIA','MIA','MIA','MIA','MIA','DEN','DEN','DEN','DEN'],
'Group':[3,3,3,3,4,4,4,4,1,1,1,1,2,2,2,2,5,5,5,5],
'Income_Yr1':[32112,34214,45575,22106,32612,34216,47515,22906,32112,34511,45525,12106,52112,54214,45015,22986,32112,34214,47518,22175],
'Income_Yr2':[52112,54215,65515,72109,52616,64217,77515,52906,52145,38512,65516,32157,63152,57218,51017,42997,38125,36253,49589,32598],
'Income_Yr3':[52143,54239,65557,72116,52660,64273,77551,52969,52500,38201,65169,32795,63288,57180,51173,42970,38205,36301,59591,32580]}
# Build the DataFrame; each dict key becomes a column.
df = pd.DataFrame(data)
See below for my attempt, I am open to other approaches
def pivot_table(data):
    """Build mean, median and std pivot tables for every income column.

    The frame is reduced to ``Group``, ``Location`` and every column whose
    name contains ``'Income'``, sorted by ``Group`` (ascending), and then one
    pivot table is produced per income column and statistic. Each table is
    indexed by ``(Group, Location)`` and carries an ``'All'`` margins row.

    Args:
        data: DataFrame with ``Group`` and ``Location`` columns and any number
            of columns whose name contains ``'Income'``.

    Returns:
        dict mapping ``(income_column, statistic)`` to its pivot table, where
        ``statistic`` is one of ``'mean'``, ``'median'`` or ``'std'``. The
        dict is empty when ``data`` has no income columns.

    Raises:
        KeyError: if ``Group`` or ``Location`` is missing from ``data``.
    """
    # 1. select the desired columns. A label slice cannot be written inside a
    #    list literal, so the income columns are picked by name instead.
    income_cols = data.filter(like='Income').columns.tolist()
    # 2. sort using the 'Group' column, lowest to highest
    df1 = data[['Group', 'Location', *income_cols]].sort_values('Group')
    # 3. one table per income column and statistic. 'Location' is used only as
    #    an index level: pivoting on it as `columns` too would duplicate the key.
    tables = {}
    for income_col in income_cols:
        for stat in ('mean', 'median', 'std'):
            tables[(income_col, stat)] = pd.pivot_table(
                df1,
                index=['Group', 'Location'],
                values=income_col,
                aggfunc=stat,
                margins=True,
            )
    return tables
##########
# test code: run the function on the sample DataFrame `df`
pivot_table(df)
Thanks
Let's use melt then groupby
We can use a dictionary comprehension then to sort and split your dataframes.
# Long format: one row per (Group, Location, income column) with the income
# in `value`, ordered from the lowest value to the highest.
df1 = df.melt(
    id_vars=['Group', 'Location'],
    value_vars=list(df.filter(like='Income').columns),
).sort_values(by='value')
# Mean, median and standard deviation for every (Group, Location, income column).
df2 = (
    df1.groupby(['Group', 'Location', 'variable'])['value']
    .agg(['mean', 'median', 'std'])
)
# Split on the last index level (the income column name): one frame per column.
out = {year_col: stats for year_col, stats in df2.groupby(level=-1)}
print(out['Income_Yr2'])
mean median std
Group Location variable
1 HOU Income_Yr2 52057.666667 52145.0 13502.211831
MIA Income_Yr2 32157.000000 32157.0 NaN
2 MIA Income_Yr2 53596.000000 54117.5 8629.910428
3 PHX Income_Yr2 60987.750000 59865.0 9466.207878
4 ATL Income_Yr2 61813.500000 58561.5 11779.239888
5 DEN Income_Yr2 39141.250000 37189.0 7333.583770
Function
I'm not sure a function adds much here unless you have very complicated data pipelines or need to reuse this piece of code in many places, but this should work:
import pandas as pd
from typing import Dict
def transform_and_split_data(data: pd.DataFrame) -> Dict[str,pd.DataFrame]:
    """Summarise every income column per (Group, Location) and split by column.

    Args:
        data: DataFrame with ``Group`` and ``Location`` columns plus the
            columns to summarise, selected by having ``'Income'`` in their name.

    Returns:
        A dict keyed by income column name. Each value is a DataFrame indexed
        by ``(Group, Location, variable)`` with ``mean``, ``median`` and
        ``std`` columns.
    """
    # Long format: one row per (Group, Location, income column, value).
    long_form = pd.melt(
        data,
        id_vars=['Group', 'Location'],
        value_vars=data.filter(like='Income').columns.tolist(),
    )
    # No pre-sort is needed: groupby orders the group keys itself and none of
    # the three statistics depends on row order.
    stats = long_form.groupby(['Group', 'Location', 'variable'])['value'].agg(
        ['mean', 'median', 'std']
    )
    # 'variable' is the index level holding the original income column names.
    return {
        income_yr: frame for income_yr, frame in stats.groupby(level='variable')
    }
I'm using a for loop to plot all of the features in my dataset. I want it to skip any attributes that have a datetime type, but it doesn't seem to be skipping them correctly. What do I need to fix?
(JFYI, I have confirmed with df.dtypes that the columns appear as datetime64[ns])
def plot_distribution(dataset, cols=5, width=20, height=50, hspace=0.2, wspace=0.5):
    """Draw one count plot per non-datetime column of ``dataset``.

    Datetime columns (naive or timezone-aware) are filtered out before any
    subplot is created, so they are neither plotted nor leave empty cells in
    the grid. Bars are split by the module-level ``target_column``.

    Args:
        dataset: DataFrame whose columns are plotted.
        cols: number of subplot columns in the grid.
        width: figure width, passed to ``figsize``.
        height: figure height, passed to ``figsize``.
        hspace: vertical spacing passed to ``subplots_adjust``.
        wspace: horizontal spacing passed to ``subplots_adjust``.
    """
    # NOTE(review): newer matplotlib releases renamed the bundled seaborn
    # styles -- confirm this style name exists in the installed version.
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    # Decide what to plot up front; checking the dtype after the plot call is
    # too late to skip anything.
    plot_columns = dataset.select_dtypes(exclude=['datetime', 'datetimetz']).columns
    rows = math.ceil(len(plot_columns) / cols)
    for position, column in enumerate(plot_columns, start=1):
        ax = fig.add_subplot(rows, cols, position)
        ax.set_title(column)
        # Plot from the frame that was passed in and draw on this subplot explicitly.
        sns.countplot(y=column, hue=target_column, data=dataset, ax=ax)
# cols=1 stacks the subplots in a single column, hence the very tall figure.
plot_distribution(df, cols=1, width=20, height=500, hspace=0.8, wspace=0.5)