I am trying to apply a simple function to a column of my pandas DataFrame, but it always shows NaN and I can't find the reason why.
Here is my code:
import pandas as pd

def get_extra_hours(value):
    return f'{value[12] - 40:.2f}'

raw_data = pd.read_csv('testdata.csv')
unified = raw_data.groupby('Employee').sum()
unified['Hourly Rate'] = raw_data.groupby('Employee').first()['Hourly Rate']
unified['Extra Hours'] = raw_data.apply(get_extra_hours, axis=1)
print(unified.to_string())
The data in value[12] is a float. I just need to subtract 40 from value[12] and return it with 2 decimals; it can be a float or a string.
I made it work. I still don't understand why it didn't work before, but here is how I did it:
def get_extra_hours(value):
    x = value['Total Hours'] - 40
    if x > 0:
        return x

URL = f'https://api.mytimestation.com/v0.1/reports/?api_key={API_KEY}&Report_StartDate={date}&id={CODE}&exportformat=csv'
raw_data = pd.read_csv('testdata.csv')
unified = raw_data.groupby('Employee').sum()
unified['Hourly Rate'] = raw_data.groupby('Employee').first()['Hourly Rate']
unified['Extra Hours'] = unified.apply(get_extra_hours, axis=1)
print(unified.to_string())
I changed the line to unified['Extra Hours'] = unified.apply(get_extra_hours, axis=1) and also changed the function get_extra_hours(). The original version failed because raw_data.apply(..., axis=1) returns a Series indexed like raw_data (a plain RangeIndex), while unified is indexed by Employee; when that Series is assigned as a column, pandas aligns on the index, nothing matches, and every value becomes NaN.
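For what it's worth, a toy illustration of the index-alignment effect (made-up names and numbers):

import pandas as pd

# assigning a Series whose index does not match the target DataFrame's
# index aligns on labels, so nothing matches and every value becomes NaN
unified = pd.DataFrame({'Total Hours': [45.0, 50.0]}, index=['Alice', 'Bob'])
s = pd.Series([5.0, 10.0])     # RangeIndex 0, 1 -- like raw_data.apply(..., axis=1)
unified['Extra Hours'] = s     # labels 0, 1 vs. 'Alice', 'Bob' -> all NaN
print(unified)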
Related
I found an interesting snippet (by vrana95) that caps multiple columns. However, this function modifies the main "df" as well, instead of working only on "final_df". Does anyone know why?
def cap_data(df):
    for col in df.columns:
        print("capping the ", col)
        if (((df[col].dtype)=='float64') | ((df[col].dtype)=='int64')):
            percentiles = df[col].quantile([0.01, 0.99]).values
            df[col][df[col] <= percentiles[0]] = percentiles[0]
            df[col][df[col] >= percentiles[1]] = percentiles[1]
        else:
            df[col] = df[col]
    return df

final_df = cap_data(df)
As I wanted to cap only a few columns, I changed the for loop of the original snippet. It works, but I would like to know why this function modifies both dataframes (see the note after the snippet below).
cols = ['score_3', 'score_6', 'credit_limit', 'last_amount_borrowed', 'reported_income', 'income']

def cap_data(df):
    for col in cols:
        print("capping the column:", col)
        if (((df[col].dtype)=='float64') | ((df[col].dtype)=='int64')):
            percentiles = df[col].quantile([0.01, 0.99]).values
            df[col][df[col] <= percentiles[0]] = percentiles[0]
            df[col][df[col] >= percentiles[1]] = percentiles[1]
        else:
            df[col] = df[col]
    return df

final_df = cap_data(df)
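For what it's worth: the function affects both dataframes because df is passed by reference, so cap_data writes into the very same object the caller holds, and returning it merely binds a second name (final_df) to it. A minimal sketch of the usual fix, copying inside the function (reusing cols and df from the snippet above):

def cap_data(df):
    df = df.copy()   # work on a copy so the caller's DataFrame stays untouched
    for col in cols:
        if df[col].dtype in ('float64', 'int64'):
            low, high = df[col].quantile([0.01, 0.99])
            df[col] = df[col].clip(lower=low, upper=high)
    return df

final_df = cap_data(df)   # df is now left unmodified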
I need to find the arithmetic mean of each column by returning res.
def ave(df, name):
    df = {
        'Courses': ["Spark", "PySpark", "Python", "pandas", None],
        'Fee': [20000, 25000, 22000, None, 30000],
        'Duration': ['30days', '40days', '35days', 'None', '50days'],
        'Discount': [1000, 2300, 1200, 2000, None]}
    #CODE HERE
    res = []
    for i in df.columns:
        res.append(col_ave(df, i))
I tried individually creating code for the mean, but I'm having trouble.
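A minimal sketch of one way to do it, assuming col_ave is meant to be the per-column mean and the dict is meant to become a DataFrame (mean() skips NaN entries):

import pandas as pd

data = {
    'Courses': ["Spark", "PySpark", "Python", "pandas", None],
    'Fee': [20000, 25000, 22000, None, 30000],
    'Duration': ['30days', '40days', '35days', 'None', '50days'],
    'Discount': [1000, 2300, 1200, 2000, None]}
df = pd.DataFrame(data)

def col_ave(df, name):
    # mean of one column; non-numeric columns coerce to NaN and yield nan
    return pd.to_numeric(df[name], errors='coerce').mean()

res = [col_ave(df, c) for c in df.columns]
print(res)   # [nan, 24250.0, nan, 1625.0]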
I'm trying to do some calculations involving a pandas Series, as shown below. First I extracted t from a DataFrame column, and then I used a for loop with "if...else..." to do the further calculation, because I found that when I used max(f_min, nan), f_min was always returned. The code below works, but it looks rather cumbersome. Is there a better way to do what I want here? Thank you so much for your help!
import numpy as np
import pandas as pd
from numpy import nan

f_min = 0.1
t_min = 0.   # degree C
t_max = 35.
t_opt = 21.
t = pd.Series([nan, nan, nan, 37., 31., 23.],
              index=['08/22/2011 07', '08/22/2011 08', '08/22/2011 09',
                     '08/22/2011 10', '08/22/2011 11', '08/22/2011 12'],
              name='T')
# t=df.T
a = (t - t_min)/(t_opt - t_min)
bt = (t_max - t_opt)/(t_opt - t_min)
b = ((t_max - t)/(t_max - t_opt))**bt
d = a * b
i = 0
for x in d:
    if pd.isna(x):
        d.iloc[i] = np.nan
    else:
        f_temp = max(f_min, x)
        d.iloc[i] = f_temp
    i = i + 1
You can use either:
d = d.clip(lower=f_min)
or
d.loc[d < f_min] = f_min
Both leave NaN values untouched, so the explicit isna() branch is unnecessary.
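A quick check that clip reproduces the loop's behaviour, since clip propagates NaN and only raises values below the floor (toy values):

import numpy as np
import pandas as pd

d = pd.Series([np.nan, 0.05, 0.30])
print(d.clip(lower=0.1).tolist())   # [nan, 0.1, 0.3]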
I have a pandas DataFrame which is created by reading an Excel file. The Excel file has a column called serial number. I then pass each serial number to another function, which connects to an API and fetches the result set for that serial number.
My code:
def create_excel(filename):
    try:
        data = pd.read_excel(filename, usecols=[4,18,19,20,26,27,28],
                             converters={'Serial Number': '{:0>32}'.format})
    except Exception as e:
        sys.exit("Error reading %s: %s" % (filename, e))
    data["Subject Organization"].fillna("N/A", inplace=True)
    df = data[data['Subject Organization'].str.contains("Fannie", case=False)]
    #df['Serial Number'].apply(lambda x: '000'+x if len(x) == 29 else '00'+x if len(x) == 30 else '0'+x if len(x) == 31 else x)
    print(df)
    df.to_excel(r'Data.xlsx', index=False)
    output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
    df2 = pd.DataFrame(output)
    df2.columns = ['Output']
    df5 = pd.concat([df, df2], axis=1)
The problem I am facing: I want to check whether the result returned by fetch_by_ser_no() is blank, and if so, make the serial number 34 characters by adding two more leading zeros and then call the function again.
How can I do this without creating multiple dataframes?
Any help is appreciated. Thanks!
You can try to use if ... else ...:
output = df['Serial Number'].apply(lambda x: 'ok' if fetch_by_ser_no(x) else 'badly')
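A minimal sketch of the retry idea (assuming fetch_by_ser_no returns something falsy, e.g. an empty string, when the lookup is blank; fetch_with_retry is just an illustrative name):

def fetch_with_retry(x):
    # first try with the 32-character serial number
    result = fetch_by_ser_no(x)
    if not result:
        # pad with two more leading zeros (34 characters) and try again
        result = fetch_by_ser_no('00' + x)
    return result

output = df['Serial Number'].apply(fetch_with_retry)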
Following on from this question, which I link as background; this question is standalone.
4 questions:
1. I cannot understand the error I see when using apply or transform: "invalid dtype determination in get_concat_dtype".
2. Why does ClipAndNetMean work but the other 2 methods not?
3. I am unsure if or why I need the .copy(deep=True).
4. Why is slightly different syntax needed to call the InnerFoo function?
The DataFrame:

              cost
section item
11      1       25
        2      100
        3       77
        4       10
12      5       50
        1       39
        2        7
        3       32
13      4       19
        1       21
        2       27
The code:
import pandas as pd
import numpy as np

df = pd.DataFrame(data={'section': [11,11,11,11,12,12,12,12,13,13,13],
                        'item': [1,2,3,4,5,1,2,3,4,1,2],
                        'cost': [25.,100.,77.,10.,50.,39.,7.,32.,19.,21.,27.]
                        })
df.set_index(['section','item'], inplace=True)

upper = 50
lower = 10

def ClipAndNetMean(cost, upper, lower):
    avg = cost.mean()
    new_cost = (cost - avg).clip(lower, upper)
    return new_cost

def MiniMean(cost, upper, lower):
    cost_clone = cost.copy(deep=True)
    cost_clone['A'] = lower
    cost_clone['B'] = upper
    v = cost_clone.apply(np.mean, axis=1)
    return v.to_frame()

def InnerFoo(lower, upper):
    def inner(group):
        group_clone = group.copy(deep=True)
        group_clone['lwr'] = lower
        group_clone['upr'] = upper
        v = group_clone.apply(np.mean, axis=1)
        return v.to_frame()
    return inner

# These 2 work fine.
print(df.groupby(level='section').apply(ClipAndNetMean, lower, upper))
print(df.groupby(level='section').transform(ClipAndNetMean, lower, upper))

# apply works but not transform
print(df.groupby(level='section').apply(MiniMean, lower, upper))
print(df.groupby(level='section').transform(MiniMean, lower, upper))

# apply works but not transform
print(df.groupby(level='section').apply(InnerFoo(lower, upper)))
print(df.groupby(level='section').transform(InnerFoo(lower, upper)))

exit()
Following up on Chris's answer: note that if I add back the column header, the methods will work in a transform call. See v.columns = ['cost']:
def MiniMean(cost, upper, lower):
    cost_clone = cost.copy(deep=True)
    cost_clone['A'] = lower
    cost_clone['B'] = upper
    v = cost_clone.apply(np.mean, axis=1)
    v = v.to_frame()
    v.columns = ['cost']
    return v

def InnerFoo(lower, upper):
    def inner(group):
        group_clone = group.copy(deep=True)
        group_clone['lwr'] = lower
        group_clone['upr'] = upper
        v = group_clone.apply(np.mean, axis=1)
        v = v.to_frame()
        v.columns = ['cost']
        return v
    return inner
1 & 2) transform expects something "like-indexed", while apply is flexible. The two failing functions are adding additional columns.
3) In some cases, (e.g. if you're passing a whole DataFrame into a function) it can be necessary to copy to avoid mutating the original. It should not be necessary here.
4) The first two functions take a DataFrame with two parameters and returns data. InnerFoo actually returns another function, so it needs to be called before being passed into apply.
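A toy illustration of the constraint (made-up data): transform must produce output that aligns with the original rows, while apply may return anything per group.

import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1.0, 2.0, 3.0]})

# transform: output must broadcast back to each group's shape -- one value
# per row (or a scalar per group), with the same index
demeaned = df.groupby('g')['x'].transform(lambda s: s - s.mean())
print(demeaned.tolist())   # [-0.5, 0.5, 0.0]

# apply: free-form -- here each group collapses to a single scalar
means = df.groupby('g')['x'].apply(lambda s: s.mean())
print(means.to_dict())     # {'a': 1.5, 'b': 3.0}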