Apply functions to multiple columns with pandas - pandas

I have 2 functions like this one:
def wind_index(result):
    """Map a single numeric value onto a wind-index score.

    Returns 1 for values above 10, 1.5 for values in (0, 5] and 2 for values
    in (5, 10]. Any other value (for example 0 or a negative number) matches
    no branch and yields None.

    The comparisons are used as ``if`` conditions, so ``result`` must be a
    scalar; a pandas Series cannot be reduced to one truth value here.
    """
    if result > 10:
        return 1
    if 0 < result <= 5:
        return 1.5
    if 5 < result <= 10:
        return 2
    # No branch matched (result <= 0 or not comparable, e.g. NaN).
    return None
def get_thermal_index(temp, hum):
    """Score the humidity-adjusted temperature with wind_index.

    The adjusted value is handed to wind_index as-is, so this only works for
    scalar ``temp``/``hum``: wind_index branches with ``if`` and cannot take
    a whole Series.
    """
    humidity_term = (1 - hum) / 100
    adjusted = temp - 0.4 * (temp - 10) * humidity_term
    return wind_index(adjusted)
When I'm trying to apply this function like this:
# Whole columns (Series) are passed in, so wind_index ends up evaluating
# `if result > 10` on a Series instead of on a single number.
df['tci'] = get_thermal_index(df['tempC'], df['humidity'])
I got this error: ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
What else can I do to get a new column for my DataFrame using those functions??

You can use Series.apply:
def get_thermal_index(temp, hum):
    """Return the wind-index score for every row.

    The adjusted temperature is computed on the whole Series at once, and
    wind_index is then called on each element through Series.apply, so its
    ``if`` conditions only ever see scalars.
    """
    adjusted = temp - 0.4 * (temp - 10) * ((1 - hum) / 100)
    return adjusted.apply(wind_index)

Related

Calculate Average True Range directly with Dataframe

I wonder if there is a simple and direct way to calculate ATR from DataFrame object. I am stuck in the max() part. This is what I am trying to do:
# The builtin max() compares its arguments and needs one True/False per
# comparison; comparing two Series yields a Series, hence the error below.
# NOTE(review): the last term reads df['close'] while the others use
# df['Close'] -- confirm the intended column name.
df['atr']=max( (df['High']-df['Low']), (df['High']-df['Close'].shift()).abs(), (df['Low']-df['close'].shift()).abs() )
The above code gives this error:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
I understand that to use max() in this context is not appropriate for the dataframe object. But if it works this would be rather elegant and simple. Just wonder if there are built in functions within dataframe object to achieve this.
Following your approach:
# Element-wise maximum of the three candidate ranges: the tuple is reduced
# along axis=0, i.e. across the three inputs, giving one value per row.
# NOTE(review): the result is not assigned here; presumably it is meant to be
# stored in a column such as df['atr'] -- confirm.
np.max( ((df['High']-df['Low']).values, np.abs(df['High']-df['Close'].shift()), np.abs(df['Low']-df['Close'].shift())) , axis=0)
A function can be this (no pandas copy warning):
def ATR(data: pd.DataFrame, window=14, use_nan=True) -> pd.Series:
    """Compute the Average True Range of an OHLC frame.

    The true range of each row is the largest of ``High - Low``,
    ``|High - previous Close|`` and ``|Low - previous Close|`` (missing values
    are skipped, so the first row falls back to ``High - Low``). The ATR is
    seeded with the simple rolling mean of the true range and then smoothed
    recursively from row ``window`` onwards:

        ATR[i] = (ATR[i - 1] * (window - 1) + TR[i]) / window

    Args:
        data: Frame with 'High', 'Low' and 'Close' columns. It is not modified.
        window: Number of rows used for the seed mean and the smoothing.
        use_nan: When true, the first ``window`` rows of the result are NaN.

    Returns:
        A Series named 'ATR' indexed like ``data``.
    """
    prev_close = data['Close'].shift(1)
    ranges = pd.DataFrame({
        'H_L': data['High'] - data['Low'],
        'H_Cp': (data['High'] - prev_close).abs(),
        'L_Cp': (data['Low'] - prev_close).abs(),
    })
    true_range = ranges.max(axis=1)

    # Work on plain arrays: the recursion is inherently sequential, but there
    # is no need to pay for a column lookup and a DataFrame cell access on
    # every iteration.
    tr = true_range.to_numpy(dtype=float)
    atr = true_range.rolling(window).mean().to_numpy(dtype=float, copy=True)
    for i in range(window, len(atr)):
        atr[i] = (atr[i - 1] * (window - 1) + tr[i]) / window

    if use_nan:
        atr[:window] = np.nan
    return pd.Series(atr, index=data.index, name='ATR')

Return integer value for time series dataframe

I need to create a function that returns a value for each row of a DataFrame. This function is based on two other functions:
def computeRSI(data, time_window=14):
    """Compute an RSI-style oscillator from successive differences of ``data``.

    Positive and negative changes are separated, each is smoothed with an
    exponentially weighted mean (``com=time_window - 1``, at least
    ``time_window`` observations), and the absolute ratio of the two averages
    is mapped to ``100 - 100 / (1 + rs)``.

    The first row is dropped by the differencing, so the result is one row
    shorter than ``data``.
    """
    delta = data.diff(1).dropna()
    zeros = 0 * delta

    # Keep only the gains (resp. losses); every other position stays at zero.
    gains = delta.where(delta > 0, zeros)
    losses = delta.where(delta < 0, zeros)

    smoothing = {'com': time_window - 1, 'min_periods': time_window}
    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = abs(avg_gain / avg_loss)
    return 100 - 100 / (1 + relative_strength)
def computeCCI(data, time_window=20):
    """Compute a CCI-style indicator from successive differences of ``data``.

    Each difference is compared with its rolling mean over ``time_window``
    rows and scaled by 0.015 times the rolling standard deviation. The result
    is a Series named ``'CCI_<time_window>'``, one row shorter than ``data``.
    """
    change = data.diff(1).dropna()
    window = change.rolling(time_window)
    deviation = change - window.mean()
    scaled = deviation / (0.015 * window.std())
    return pd.Series(scaled, name=f'CCI_{time_window}')
def composite_indicator(data):
    """Combine computeRSI and computeCCI into one '+2' .. '-2' label.

    NOTE: both indicators are whole Series, so every ``a > x and b > y`` test
    below asks Python for the truth value of a Series. This is the snippet
    that produces the ValueError discussed in the question; it is kept in
    that form on purpose.
    """
    diff = data.diff(1).dropna()
    rsi = computeRSI(diff)
    cci = computeCCI(diff)

    if rsi > 70 and cci > 100:
        return '+2'
    if rsi > 60 and cci > 50:
        return '+1'
    if rsi < 30 and cci < -100:
        return '-2'
    if rsi < 40 and cci < -50:
        return '-1'
    return '0'
My composite indicator function is giving me :
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
I would like it to return +2, +1, -2, -1 or 0. [image: enter image description here]

Why do I have to use a.any() or a.all() in this code?

In this code below, I found that when I put a number it works, but when I put ndarray then it would post an error message.
Why do I have to use a.any() or a.all() in this case?
import numpy as np
def ht(x):
    """Return 1 when x is odd and 0 otherwise.

    ``x`` must be a single number: the comparison is used directly as a
    condition, so an array with several elements has no single truth value.
    """
    return 1 if x % 2 == 1 else 0
ht(1)
[Example]
ht(1): 1
ht(np.array([1,2,3,4])) : The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
When Python evaluates an if statement, the condition has to reduce to a single True/False value.
if var:
pass
var has to be a bool, or something Python can unambiguously convert to one.
if x is a number, then x%2 == 1 is a bool.
if x is a np.array, then x%2 == 1 is a np.array which isn't a bool, but rather an array of bool, in which each cell states whether *that cell* %2 == 1.
You can check if all elements in it are truthy (1) or if any of them are truthy with np.all or np.any.
This is because when np.array([1,2,3,4])%2 is performed, the output is also in np array format - array([1, 0, 1, 0]). To check whether these individual array elements are 1 or 0, one has to use the any() or all() function. There is no problem when we pass a single element.
So, here is the modified code -
import numpy as np
def ht(x):
    """Return 1 when every element of x is odd, otherwise 0.

    Accepts a scalar, a list, or an array of any dimensionality. np.all is
    used instead of the builtin all(): the builtin iterates over its argument,
    which fails for a scalar result and, for a 2-D array, yields whole rows
    whose truth value is ambiguous again.
    """
    if np.all(np.asarray(x) % 2 == 1):  # true only when all remainders are 1
        return 1
    return 0
ht(np.array([1,2,3,4]))
Output for the above code is 0
import numpy as np
def ht(x):
    """Return 1 when at least one element of x is odd, otherwise 0.

    Accepts a scalar, a list, or an array of any dimensionality. np.any is
    used instead of the builtin any(): the builtin iterates over its argument,
    which fails for a scalar result and, for a 2-D array, yields whole rows
    whose truth value is ambiguous again.
    """
    if np.any(np.asarray(x) % 2 == 1):  # true as soon as any remainder is 1
        return 1
    return 0
ht(np.array([1,2,3,4]))
Output for the above code is 1

iterating pandas rows using .apply()

I wanted to iterate through the pandas data frame but for some reason it does not work with .apply() method.
train = pd.read_csv('../kaggletrain')
pclass = train['Pclass']
# pclass has list of data with either 1, 2 or 3..
# so wanted to return if the cell is 1 then return True or everything False
def abc(pclass):
    """Return True when ``pclass`` equals 1, otherwise False.

    The comparison is reduced to one bool, so ``pclass`` has to be a single
    value; a Series compared with 1 has no single truth value.
    """
    return bool(pclass == 1)
# With axis=1, DataFrame.apply passes abc one whole row (a Series) per call,
# so `pclass == 1` inside abc is a Series comparison, not a single bool.
ABCDEFG = train.apply(abc, axis=1)
This gives ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Thank you for your help
# The column is spelled 'Pclass' (capital P) where the question reads it;
# column labels are case-sensitive, so 'pclass' would raise KeyError.
# This keeps only the rows whose Pclass is 1. For a True/False value per row
# instead, use the mask on its own: train['Pclass'] == 1
ABCDEFG = train[train['Pclass'] == 1]

how to compare the values of two columns using condition, and assign a value when that condition is met

I want to compare the home_score and away_score column values and, in a new column, assign homeloss if home_score < away_score, homewin if home_score > away_score, and draw if home_score == away_score.
era1800_1900 = era(eras,1800,1900)
# Python chains `a < b == "Lose"` as `(a < b) and (b == "Lose")`. The `and`
# needs a single True/False from `a < b`, but that comparison is a Series.
era1800_1900["result"] = era1800_1900[(era1800_1900["home_score"] < era1800_1900["away_score"] == "Lose")]
I expect another column result in my data frame with values homeloss, homewin and draw based on the condition scores but i get this error when i used the following code
era1800_1900 = era(eras,1800,1900)
era1800_1900["result"] = era1800_1900[(era1800_1900["home_score"] < era1800_1900["away_score"] == "Lose")]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-78-58ef8c4a0715> in <module>
1 era1800_1900 = era(eras,1800,1900)
----> 2 era1800_1900["result"] = era1800_1900[(era1800_1900["home_score"] < era1800_1900["away_score"] == "Lose")]
~\Anaconda3 new\lib\site-packages\pandas\core\generic.py in __nonzero__(self)
1574 raise ValueError("The truth value of a {0} is ambiguous. "
1575 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
-> 1576 .format(self.__class__.__name__))
1577
1578 __bool__ = __nonzero__
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Try the following approach:
# Label every row by comparing the two score columns.
# NOTE(review): `era` is used as the DataFrame here, and 'A' / 'B' are
# presumably placeholders for the home and away score columns -- confirm.
era['result'] = None
# A boolean mask can be passed to .loc directly; there is no need to filter
# the frame first and look the rows up again by index.
era.loc[era['A'] < era['B'], 'result'] = 'homelose'
era.loc[era['A'] > era['B'], 'result'] = 'homewin'
# Draws are the rows where the scores are equal. Repeating `<` here would
# overwrite every 'homelose' row and leave real draws unlabelled.
era.loc[era['A'] == era['B'], 'result'] = 'homedraw'
If you are comfortable with functions, look at this example