Capping multiple columns - pandas

I found an interesting snippet (from vrana95) that caps multiple columns. However, this function modifies the original "df" as well, instead of working only on "final_df". Does anyone know why?
def cap_data(df):
    for col in df.columns:
        print("capping the ", col)
        if (((df[col].dtype) == 'float64') | ((df[col].dtype) == 'int64')):
            percentiles = df[col].quantile([0.01, 0.99]).values
            df[col][df[col] <= percentiles[0]] = percentiles[0]
            df[col][df[col] >= percentiles[1]] = percentiles[1]
        else:
            df[col] = df[col]
    return df

final_df = cap_data(df)
As I wanted to cap only a few columns, I changed the for loop of the original snippet. It works, but I would like to know why this function modifies both dataframes.
cols = ['score_3', 'score_6', 'credit_limit', 'last_amount_borrowed', 'reported_income', 'income']

def cap_data(df):
    for col in cols:
        print("capping the column:", col)
        if (((df[col].dtype) == 'float64') | ((df[col].dtype) == 'int64')):
            percentiles = df[col].quantile([0.01, 0.99]).values
            df[col][df[col] <= percentiles[0]] = percentiles[0]
            df[col][df[col] >= percentiles[1]] = percentiles[1]
        else:
            df[col] = df[col]
    return df

final_df = cap_data(df)
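For context: cap_data mutates the DataFrame it receives (the object is passed by reference) and then returns that same object, so final_df and df end up pointing to the same data. A minimal non-mutating sketch (my own variant, not the original snippet, using .copy() and .clip() with the same cols list) could look like this:

def cap_data_copy(df, cols, lower=0.01, upper=0.99):
    # Work on a copy so the caller's DataFrame is left untouched
    out = df.copy()
    for col in cols:
        if out[col].dtype in ('float64', 'int64'):
            lo, hi = out[col].quantile([lower, upper])
            # clip() caps everything below lo and above hi in one step
            out[col] = out[col].clip(lo, hi)
    return out

final_df = cap_data_copy(df, cols)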

Related

Fetch data from several similarly named tables in DAE Databricks?

I'm using PySpark in DAE Databricks to get HES data.
At the moment I do this:
df_test = sqlContext.sql("select * from db_name_2122")
ICD10_codes = ['X85','X87']
df_test = df_test.filter((df_test.field1 == "something") &
                         (df_test.field.rlike('|'.join(ICD10_codes))))

df_test_2 = sqlContext.sql("select * from db_name_2021")
ICD10_codes = ['X85','X87']
df_test_2 = df_test_2.filter((df_test_2.field1 == "something") &
                             (df_test_2.field.rlike('|'.join(ICD10_codes))))
I have to do this for financial years 1112, 1213, 1314, ..., 2122. This is a lot of copy-pasting of similar code, and I know it is bad practice - both from experience of finding copy-and-paste errors and from what I've read.
What I want to do:
Select data where the same conditions are met in the same fields across 11 different financial-year tables within a DB, and pull it all into one table at the end.
Rather than what I'm doing now, which is 11 different but similar copy-and-paste chunks of code, which are then appended together.
First, before the "appending" you can put all the dfs into the same list:
dfs = []
for i in range(11, 22):
    df = sqlContext.sql(f"select * from db_name_{i}{i+1}")
    ICD10_codes = ['X85','X87']
    df = df.filter((df.field1 == "something") &
                   (df.field.rlike('|'.join(ICD10_codes))))
    dfs.append(df)
And then do the "append". I'm not sure what you mean by "append", but this is how you could do a unionByName:
df_final = dfs[0]
for df in dfs[1:]:
    df_final = df_final.unionByName(df)
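As an aside, a more compact way to fold the list into a single DataFrame (assuming all the DataFrames share the same schema) is functools.reduce; this is just a sketch of the same union loop:

from functools import reduce
from pyspark.sql import DataFrame

# Union every DataFrame in the list pairwise, left to right
df_final = reduce(DataFrame.unionByName, dfs)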
Everything can be added into one loop:
ICD10_codes = ['X85','X87']
rng = range(11, 22)

for i in rng:
    df = sqlContext.sql(f"select * from db_name_{i}{i+1}")
    df = df.filter((df.field1 == "something") &
                   (df.field.rlike('|'.join(ICD10_codes))))
    df_final = df if i == rng[0] else df_final.unionByName(df)
If your table names are more complex, you can put them directly into the list:
tables = ['db_name_1112_text1', 'db_name_1213_text56']
ICD10_codes = ['X85','X87']

for x in tables:
    df = sqlContext.sql(f"select * from {x}")
    df = df.filter((df.field1 == "something") &
                   (df.field.rlike('|'.join(ICD10_codes))))
    df_final = df if x == tables[0] else df_final.unionByName(df)

Dataframe column filter from a list of tuples

I'm trying to create a function to filter a dataframe from a list of tuples. I've created the below function but it doesn't seem to be working.
Each tuple in the list has a dataframe column name, a min value and a max value to filter on.
eg:
eg_tuple = [('colname1', 10, 20), ('colname2', 30, 40), ('colname3', 50, 60)]
My attempted function is below:
def col_cut(df, cutoffs):
    for c in cutoffs:
        df_filter = df[(df[c[0]] >= c[1]) & (df[c[0]] <= c[2])]
    return df_filter
Note that the function should not filter on rows where the value is equal to max or min. Appreciate the help.
The problem is that each time through the loop you take the original df as the source to filter, so only the last cutoff has any effect. You should filter with:
def col_cut(df, cutoffs):
    df_filter = df
    for col, mn, mx in cutoffs:
        dfcol = df_filter[col]
        df_filter = df_filter[(dfcol >= mn) & (dfcol <= mx)]
    return df_filter
Note that you can use .between(..) [pandas-doc] here:
def col_cut(df, cutoffs):
    df_filter = df
    for col, mn, mx in cutoffs:
        df_filter = df_filter[df_filter[col].between(mn, mx)]
    return df_filter
Use np.logical_and.reduce on all the masks created by a list comprehension with Series.between:
def col_cut(df, cutoffs):
    mask = np.logical_and.reduce([df[col].between(min1, max1) for col, min1, max1 in cutoffs])
    return df[mask]
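A quick usage sketch with made-up data (the column names are taken from the question's example; the values are invented for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({'colname1': [5, 15, 18, 25],
                   'colname2': [35, 32, 39, 33],
                   'colname3': [55, 51, 58, 52]})

eg_tuple = [('colname1', 10, 20), ('colname2', 30, 40), ('colname3', 50, 60)]

# Keeps only the rows where every listed column falls inside its [min, max] range
print(col_cut(df, eg_tuple))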

Add value to a column from a .apply function

I am trying to apply a simple value to a column of my pandas frame, but it always shows NaN, and I can't find the reason why.
Here is my code:
def get_extra_hours(value):
    return f'{value[12] - 40: .2f}'

raw_data = pd.read_csv('testdata.csv')
unified = raw_data.groupby('Employee').sum()
unified['Hourly Rate'] = raw_data.groupby('Employee').first()['Hourly Rate']
unified['Extra Hours'] = raw_data.apply(get_extra_hours, axis=1)
print(unified.to_string())
The data in value[12] is a float; I just need to subtract 40 from value[12] and return it with 2 decimals. It can be a float or a string.
I made it work. I still don't understand why it didn't work before, but here is how I did it:
def get_extra_hours(value):
    x = value['Total Hours'] - 40
    if x > 0:
        return x

URL = f'https://api.mytimestation.com/v0.1/reports/?api_key={API_KEY}&Report_StartDate={date}&id={CODE}&exportformat=csv'
raw_data = pd.read_csv('testdata.csv')
unified = raw_data.groupby('Employee').sum()
unified['Hourly Rate'] = raw_data.groupby('Employee').first()['Hourly Rate']
unified['Extra Hours'] = unified.apply(get_extra_hours, axis=1)
print(unified.to_string())
I changed the line to unified['Extra Hours'] = unified.apply(get_extra_hours, axis=1)
and also changed the function get_extra_hours().
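For what it's worth, the original version likely failed because raw_data.apply(..., axis=1) returns a Series indexed by raw_data's row numbers, while unified is indexed by 'Employee', so the assignment could not align the indexes and produced NaN. A vectorized sketch, assuming the summed hours column is named 'Total Hours':

# Hours over 40, floored at 0 and rounded to 2 decimals
unified['Extra Hours'] = (unified['Total Hours'] - 40).clip(lower=0).round(2)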

Time Difference between Time Period and Instant

I have some time periods (df_A) and some time instants (df_B):
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta

# Data
df_A = pd.DataFrame({'A1': [dt.datetime(2017,1,5,9,8), dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19),
                            dt.datetime(2017,1,7,9,19), dt.datetime(2017,2,7,9,19), dt.datetime(2017,2,7,9,19)],
                     'A2': [dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,5,9,12), dt.datetime(2017,1,7,9,26), dt.datetime(2017,1,7,9,20),
                            dt.datetime(2017,1,7,9,21), dt.datetime(2017,2,7,9,23), dt.datetime(2017,2,7,9,25)]})
df_B = pd.DataFrame({'B': [dt.datetime(2017,1,6,14,45), dt.datetime(2017,1,4,3,31), dt.datetime(2017,1,7,3,31),
                           dt.datetime(2017,1,7,14,57), dt.datetime(2017,1,9,14,57)]})
I can match these together:
# Define an Extra Margin
M = dt.timedelta(days=10)
df_A["A1X"] = df_A["A1"] + M
df_A["A2X"] = df_A["A2"] - M

# Match
Bv = df_B.B.values
A1 = df_A.A1X.values
A2 = df_A.A2X.values
i, j = np.where((Bv[:, None] >= A1) & (Bv[:, None] <= A2))
df_C = pd.DataFrame(np.column_stack([df_B.values[i], df_A.values[j]]),
                    columns=df_B.columns.append(df_A.columns))
I would like to find the time difference between each time period and the time instant matched to it. I mean that
if B is between A1 and A2,
then dt = 0.
I've tried doing it like this:
# Calculate dt
def time(A1, A2, B):
    if df_C["B"] < df_C["A1"]:
        return df_C["A1"].subtract(df_C["B"])
    elif df_C["B"] > df_C["A2"]:
        return df_C["B"].subtract(df_C["A2"])
    else:
        return 0

df_C['dt'] = df_C.apply(time)
I'm getting "ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series"
So, I found two fixes:
You are adding M to the lower value and subtracting from the higher one. Change it to:
df_A['A1X'] = df_A['A1'] - M
df_A['A2X'] = df_A['A2'] + M
apply passes one row of your dataframe at a time to your time function (not separate A1, A2 and B arguments), so it should be something like:
def time(row):
    if row['B'] < row['A1']:
        return row['A1'] - row['B']
    elif row['B'] > row['A2']:
        return row['B'] - row['A2']
    else:
        return 0
And then you can call it like this:
df_C['dt'] = df_C.apply(time, axis=1)
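If you want to avoid apply altogether, a vectorized sketch of the same logic (assuming A1, A2 and B are proper datetime64 columns; you may need pd.to_datetime after the column_stack) could look like this:

zero = pd.Timedelta(0)
# Distance below A1 plus distance above A2; both terms are zero when B lies inside [A1, A2]
df_C['dt'] = ((df_C['A1'] - df_C['B']).clip(lower=zero)
              + (df_C['B'] - df_C['A2']).clip(lower=zero))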

Pandas Apply(), Transform() ERROR = invalid dtype determination in get_concat_dtype

Following on from this question, which I link as background, but this question is standalone.
4 questions:
1) I cannot understand the error I see when using apply or transform: "invalid dtype determination in get_concat_dtype"
2) Why does ClipAndNetMean work but not the other two methods?
3) I'm unsure if or why I need the .copy(deep=True)
4) Why is slightly different syntax needed to call the InnerFoo function?
The DataFrame:
              cost
section item
11      1       25
        2      100
        3       77
        4       10
12      5       50
        1       39
        2        7
        3       32
13      4       19
        1       21
        2       27
The code:
import pandas as pd
import numpy as np

df = pd.DataFrame(data={'section': [11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13],
                        'item': [1, 2, 3, 4, 5, 1, 2, 3, 4, 1, 2],
                        'cost': [25., 100., 77., 10., 50., 39., 7., 32., 19., 21., 27.]})
df.set_index(['section', 'item'], inplace=True)

upper = 50
lower = 10

def ClipAndNetMean(cost, upper, lower):
    avg = cost.mean()
    new_cost = (cost - avg).clip(lower, upper)
    return new_cost

def MiniMean(cost, upper, lower):
    cost_clone = cost.copy(deep=True)
    cost_clone['A'] = lower
    cost_clone['B'] = upper
    v = cost_clone.apply(np.mean, axis=1)
    return v.to_frame()

def InnerFoo(lower, upper):
    def inner(group):
        group_clone = group.copy(deep=True)
        group_clone['lwr'] = lower
        group_clone['upr'] = upper
        v = group_clone.apply(np.mean, axis=1)
        return v.to_frame()
    return inner

# These 2 work fine.
print(df.groupby(level='section').apply(ClipAndNetMean, lower, upper))
print(df.groupby(level='section').transform(ClipAndNetMean, lower, upper))

# apply works but not transform
print(df.groupby(level='section').apply(MiniMean, lower, upper))
print(df.groupby(level='section').transform(MiniMean, lower, upper))

# apply works but not transform
print(df.groupby(level='section').apply(InnerFoo(lower, upper)))
print(df.groupby(level='section').transform(InnerFoo(lower, upper)))

exit()
So, regarding Chris's answer: note that if I add back the column header, the methods will work in a transform call.
See v.columns = ['cost']:
def MiniMean(cost, upper, lower):
    cost_clone = cost.copy(deep=True)
    cost_clone['A'] = lower
    cost_clone['B'] = upper
    v = cost_clone.apply(np.mean, axis=1)
    v = v.to_frame()
    v.columns = ['cost']
    return v

def InnerFoo(lower, upper):
    def inner(group):
        group_clone = group.copy(deep=True)
        group_clone['lwr'] = lower
        group_clone['upr'] = upper
        v = group_clone.apply(np.mean, axis=1)
        v = v.to_frame()
        v.columns = ['cost']
        return v
    return inner
1 & 2) transform expects something "like-indexed" (output shaped like its input), while apply is flexible. The two failing functions add extra columns, which breaks transform.
3) In some cases (e.g. if you're passing a whole DataFrame into a function) it can be necessary to copy to avoid mutating the original. It should not be necessary here.
4) The first two functions take a DataFrame plus two parameters and return data. InnerFoo actually returns another function, so it needs to be called before being passed to apply.
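As a minimal sketch of the distinction, using df, lower and upper from the question: a function whose output is shaped like its input works with transform, while one that adds columns only works with apply:

# Output has the same shape as the input, so transform accepts it
print(df.groupby(level='section').transform(lambda g: g - g.mean()))

# Output gains extra columns, so it is no longer "like-indexed" and only apply accepts it
def with_bounds(g):
    out = g.copy()
    out['lwr'] = lower
    out['upr'] = upper
    return out

print(df.groupby(level='section').apply(with_bounds))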