Value from iterative function in pandas

I have a dataframe and would like to set the values in one column through an iterative function, as below.
import pandas as pd
import numpy as np

d = {'col1': [0.4444, 25.4615],
     'col2': [0.5, 0.7],
     'col3': [7, 7]}
df = pd.DataFrame(data=d)
df['col4'] = df['col1'] * df['col3'] / 4

def func(df):
    a = np.exp(-df['col4'])
    n = 1
    while df['col2'] < a:
        a = a + df['col4'] * 4 / n
        n += 1
    return n

df['col5'] = func(df)
I get an error message "ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." How can I run the function per row to solve the series/ambiguity problem?
EDIT: Added expected output.
out = {'col1': [0.4444, 25.4615],
       'col2': [0.5, 0.7],
       'col3': [7, 7],
       'col4': [0.7777, 44.557625],
       'col5': [0, 49]}
dfout = pd.DataFrame(out)
I am not sure of the exact values col4 and col5 will take, but according to the calculation I am trying to replicate, they should be close to these.
EDIT2: I had missed n += 1 in the while loop; it is added now.
EDIT3: I am trying to apply
f(0) = e^(-col4)
f(n) = col4 * f(n-1) / n for n > 0
until f > col2, and then return the value of n per row.

Using the information you provided, this seems to be the solution: apply the function one row at a time with df.apply(..., axis=1), so every comparison is between scalars. One assumption is needed: the bare recurrence f(n) = col4 * f(n-1) / n never exceeds col2 for the first row, so the sketch below accumulates a running sum of the terms and loops until that sum exceeds col2.
import pandas as pd
import numpy as np

d = {'col1': [0.4444, 25.4615],
     'col2': [0.5, 0.7],
     'col3': [7, 7]}
df = pd.DataFrame(data=d)
df['col4'] = df['col1'] * df['col3'] / 4

def func(row):
    f = np.exp(-row['col4'])  # f(0)
    total = f                 # running sum of the terms (assumed intent)
    n = 0
    while total <= row['col2']:
        n += 1
        f = f * row['col4'] / n  # f(n) = col4 * f(n-1) / n
        total += f
    return n

# axis=1 passes one row at a time, avoiding the Series ambiguity
df['col5'] = df.apply(func, axis=1)

For what it is worth, here is an inefficient solution: after each iteration, keep track of which coefficients start satisfying the condition.
import pandas as pd
import numpy as np

d = {'col1': [0.4444, 25.4615],
     'col2': [0.5, 0.7],
     'col3': [7, 7]}
df = pd.DataFrame(data=d)
df['col4'] = df['col1'] * df['col3'] / 4

def func(df):
    a = np.exp(-df['col4'])
    n = 1
    ns = [None] * len(df['col2'])
    status = a > df['col2']
    # record rows that satisfy the condition from the start
    for i in range(len(status)):
        if ns[i] is None and status[i]:
            ns[i] = n
    # stops when all coefficients satisfy the condition
    while not status.all():
        a = a * df['col4'] * n
        status = a > df['col2']
        n += 1
        for i in range(len(status)):
            if ns[i] is None and status[i]:
                ns[i] = n
    return ns

df['col5'] = func(df)
print(df['col5'])
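For comparison with the row-wise apply above: this version re-evaluates the update for every row on each iteration, even for rows that already satisfy the condition, which is where the inefficiency comes from.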

Related

Two Pandas dataframes, how to interpolate row-wise using scipy

How can I use scipy interpolate on two dataframes, interpolating row-wise?
For example, if I have:
dfx = pd.DataFrame({"a": [0.1, 0.2, 0.5, 0.6], "b": [3.2, 4.1, 1.1, 2.8]})
dfy = pd.DataFrame({"a": [0.8, 0.2, 1.1, 0.1], "b": [0.5, 1.3, 1.3, 2.8]})
display(dfx)
display(dfy)
And say I want to interpolate for y(x=0.5), how can I get the results into an array that I can put in a new dataframe?
Expected result is: [0.761290323 0.284615385 1.1 -0.022727273]
For example, for the first row, you can see the expected value is 0.761290323:
import matplotlib.pyplot as plt
import scipy.interpolate

x = [0.1, 3.2]  # from dfx, row 0
y = [0.8, 0.5]  # from dfy, row 0

fig, ax = plt.subplots(1, 1)
ax.plot(x, y)

f = scipy.interpolate.interp1d(x, y)
out = f(0.5)
print(out)
I tried the following but received ValueError: x and y arrays must be equal in length along interpolation axis.
f = scipy.interpolate.interp1d(dfx, dfy)
out = f(0.5)
print(out)
Since you are looking for linear interpolation, you can do:
def interpolate(val, dfx, dfy):
    t = (dfx['b'] - val) / (dfx['b'] - dfx['a'])
    return dfy['a'] * t + dfy['b'] * (1 - t)

interpolate(0.5, dfx, dfy)
Output:
0    0.761290
1    0.284615
2    1.100000
3   -0.022727
dtype: float64
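If you would rather stay with scipy, a per-row loop over interp1d reproduces the expected array as well. This is only a sketch: it rebuilds one interpolator per row, and fill_value="extrapolate" is needed because row 3 queries x=0.5 outside its [0.6, 2.8] range.
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

dfx = pd.DataFrame({"a": [0.1, 0.2, 0.5, 0.6], "b": [3.2, 4.1, 1.1, 2.8]})
dfy = pd.DataFrame({"a": [0.8, 0.2, 1.1, 0.1], "b": [0.5, 1.3, 1.3, 2.8]})

# one interpolator per row; extrapolation covers rows where 0.5
# falls outside the row's x interval
out = np.array([
    float(interp1d(dfx.iloc[i], dfy.iloc[i], fill_value="extrapolate")(0.5))
    for i in range(len(dfx))
])
print(out)  # approximately [0.76129032 0.28461538 1.1 -0.02272727]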

Numpy equivalent of pandas replace (dictionary mapping)

I know that working on a numpy array can be quicker than working on a pandas DataFrame.
I am wondering if there is an equivalent (and quicker) way to do pandas.replace on a numpy array.
In the example below, I have created a dataframe and a dictionary. The dictionary contains the names of the columns and their corresponding mappings. I wonder if there is any function that would allow me to feed a dictionary to a numpy array to do the mapping and yield a quicker processing time?
import pandas as pd
import numpy as np
# Dataframe
d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
df = pd.DataFrame(data=d)
# dictionary I want to map
d_mapping = {'col1' : {1:2 , 2:1} , 'col2' : {4:1}}
# result using pandas replace
print(df.replace(d_mapping))
# Instead of a pandas dataframe, I want to perform the same operation on a numpy array
df_np = df.to_records(index=False)
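For reference, since the nested dictionary replaces per column (1 and 2 swap in col1, and 4 becomes 1 in col2), the print above should show:
   col1  col2
0     2     1
1     1     5
2     3     6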
You can try np.select(). I believe its performance depends on the number of unique elements to replace.
def replace_values(df, d_mapping):
    def replace_col(col):
        # extract numpy array and column name from the pd.Series
        col, name = col.values, col.name
        # generate condlist and choicelist:
        # for every key in the mapping create a boolean mask
        condlist = [col == x for x in d_mapping[name].keys()]
        choicelist = list(d_mapping[name].values())
        # np.select falls back to the existing value (the col default)
        # where no condition matches
        return np.select(condlist, choicelist, col)
    return df.apply(replace_col)
usage:
replace_values(df, d_mapping)
I also believe that you can speed up the code above by using lists/arrays in the mapping instead of dicts, replacing the keys() and values() calls with index lookups (repeated dict lookups are also expensive, so do them once per column):
d_mapping = {"col1": [[1, 2], [2, 1]], "col2": [[4], [1]]}
...
m = d_mapping[name]
condlist = [col == x for x in m[0]]
choicelist = m[1]
...
np.isin(col, m[0]) could likewise produce a single mask of all elements that will be replaced.
Update:
Here is the benchmark:
import pandas as pd
import numpy as np

# Dataframe
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
# dictionary I want to map
d_mapping = {"col1": [[1, 2], [2, 1]], "col2": [[4], [1]]}
d_mapping_2 = {
    col: dict(zip(*replacement)) for col, replacement in d_mapping.items()
}

def replace_values(df, mapping):
    def replace_col(col):
        col, (m0, m1) = col.values, mapping[col.name]
        return np.select([col == x for x in m0], m1, col)
    return df.apply(replace_col)

from timeit import timeit

print("np.select: ", timeit(lambda: replace_values(df, d_mapping), number=5000))
print("df.replace: ", timeit(lambda: df.replace(d_mapping_2), number=5000))
On my 6-year-old laptop it prints:
np.select: 3.6562702230003197
df.replace: 4.714512745998945
np.select is ~20% faster

Getting column names in pandas that equal a value for a given row

I have a table like this:
  Scen  F1  F2  F3  F4
0   S1   1   0   1   0
1   S2   0   1   0   1
and want to search by Scen and return the column names that == 1 for that row, e.g. for S1 I require F1, F3 as the result.
I've tried the following, and can get the result by hard-coding df_col[0], but I need to be able to do this dynamically.
What's the best way to do this?
import pandas as pd

d = {'Scen': ["S1", "S2"],
     'F1': [1, 0],
     'F2': [0, 1],
     'F3': [1, 0],
     'F4': [0, 1]}
df = pd.DataFrame(data=d)

def get_features(df, col_name):
    df_col = df[(df.Scen == col_name)].T
    feats = (df_col[(df_col[0] == 1)]).index.to_list()
    print(feats)
    return feats

get_features(df, "S1")
get_features(df, "S2")
EDIT:
Based on RichieV's answer, this works:
import numpy as np

def get_features(df, col_name):
    df = df.replace(0, np.nan)
    df = df.melt('Scen')
    df_scen = df['variable'].loc[(df['Scen'] == col_name) & (df['value'] == 1)]
    return list(df_scen)
This is a one-hot decoding operation. Encoding pivots a column out into ones, so now we need to melt it back.
df = df.replace(0, np.nan)  # get rid of zeros, they only fill space
df = df.melt('Scen').dropna().drop('value', axis=1)
Now df has four rows and two columns (Scen and variable), with repeated Scen rows for each corresponding feature. You can use df as it is, or group by scenario and gather the features into lists.
df = df.groupby('Scen')['variable'].agg(list)
This works, though I am not sure it is the most efficient:
import pandas as pd

d = {'Scen': ["S1", "S2"],
     'F1': [1, 0],
     'F2': [0, 1],
     'F3': [1, 0],
     'F4': [0, 1]}
df = pd.DataFrame(data=d)

def get_features(df, col_name):
    df_col = df[(df.Scen == col_name)].T
    df_feats = df_col.loc[df_col[df_col.columns.values[0]] == 1]
    return list(df_feats.index)

s1_list = get_features(df, "S1")
s2_list = get_features(df, "S2")
print(s1_list)
print(s2_list)
['F1', 'F3']
['F2', 'F4']
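For what it is worth, a compact alternative is to index each row's boolean mask into the column labels. This is only a sketch (the row-wise apply is still a Python-level loop):
import pandas as pd

d = {'Scen': ["S1", "S2"],
     'F1': [1, 0], 'F2': [0, 1], 'F3': [1, 0], 'F4': [0, 1]}
df = pd.DataFrame(data=d)

# Scen becomes the index; each remaining row is a 0/1 mask over F1..F4
feats = df.set_index('Scen').astype(bool)
lists = feats.apply(lambda row: row.index[row.values].tolist(), axis=1)
print(lists['S1'])  # ['F1', 'F3']
print(lists['S2'])  # ['F2', 'F4']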

Vectorization of selective cumulative sum

I have a pandas Series where each element is a list with indices:
series_example = pd.Series([[1, 3, 2], [1, 2]])
In addition, I have an array with values associated to every index:
arr_example = np.array([3., 0.5, 0.25, 0.1])
I want to create a new Series with the cumulative sums of the elements of the array given by the indices in the row of the input Series. In the example, the output Series would have the following contents:
0 [0.5, 0.6, 0.85]
1 [0.5, 0.75]
dtype: object
The non-vectorized way to do it would be the following:
def non_vector_transform(series, array):
    series_output = pd.Series(np.zeros(len(series)), dtype=object)
    for i in range(len(series)):
        element_list = series[i]
        series_output[i] = []
        acum = 0
        for element in element_list:
            acum += array[element]
            series_output[i].append(acum)
    return series_output
I would like to do this in a vectorized way. Any vectorization magician to help me in here?
Use Series.apply and np.cumsum:
import numpy as np
import pandas as pd
series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])
result = series_example.apply(lambda x: np.cumsum(arr_example[x]))
print(result)
Or if you prefer a for loop:
import numpy as np
import pandas as pd

series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])

# Copy only if you do not want to overwrite the original series
result = series_example.copy()
for i, x in result.items():  # items() rather than the deprecated iteritems()
    result[i] = np.cumsum(arr_example[x])
print(result)
Output:
0 [0.5, 0.6, 0.85]
1 [0.5, 0.75]
dtype: object
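A genuinely vectorized version is also possible. The sketch below assumes every list is non-empty and contains valid indices into the array: flatten all the index lists, take one global cumulative sum, and subtract each row's starting offset so the sum restarts per row.
import numpy as np
import pandas as pd

series_example = pd.Series([[1, 3, 2], [1, 2]])
arr_example = np.array([3., 0.5, 0.25, 0.1])

flat = np.concatenate(series_example.to_list())     # all indices, flattened
lens = series_example.str.len().to_numpy()          # elements per row
vals = arr_example[flat]                            # looked-up values
csum = vals.cumsum()                                # one global running sum
starts = np.concatenate(([0], lens.cumsum()[:-1]))  # first position of each row
# subtract everything accumulated before each row's first element
offset = np.repeat(csum[starts] - vals[starts], lens)
result = pd.Series(np.split(csum - offset, lens.cumsum()[:-1]))
print(result)
The rows come back as numpy arrays rather than lists, but the values match the loop versions above.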

Slice pandas' MultiIndex DataFrame

To keep track of all simulation results in a parametric run, I create a MultiIndex DataFrame named dfParRun in pandas as follows:
import pandas as pd
import numpy as np
import itertools

limOpt = [0.1, 1, 10]
reimbOpt = ['Cash', 'Time']
xOpt = [0.1, .02, .03, .04, .05, .06, .07, .08]
zOpt = [1, 5, 10]
arrays = [limOpt, reimbOpt, xOpt, zOpt]
parameters = list(itertools.product(*arrays))
nPar = len(parameters)
variables = ['X', 'Y', 'Z']
nVar = len(variables)
index = pd.MultiIndex.from_tuples(parameters, names=['lim', 'reimb', 'xMax', 'zMax'])
dfParRun = pd.DataFrame(np.random.rand(nPar, nVar), index=index, columns=variables)
To analyse my parametric run, I want to slice this dataframe, but this seems a burden. For example, I want all results for xMax above 0.05 and lim equal to 10. At the moment, the only working method I have found is:
df = dfParRun.reset_index()
df.loc[(df.xMax > 0.05) & (df.lim == 10)]
and I wonder if there is a method that does not require resetting the index of the DataFrame?
option 1
use pd.IndexSlice
caveat: requires sort_index
dfParRun.sort_index().loc[pd.IndexSlice[10, :, .0500001:, :]]
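(The odd-looking lower bound .0500001 is deliberate: label slices in .loc are inclusive at both ends, so starting a hair above .05 makes the selection strictly greater than .05.)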
option 2
use your df after having reset_index
df.query('xMax > 0.05 & lim == 10')
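Note that recent pandas versions let query refer to MultiIndex level names directly, so, if your version supports it, option 2 also works without reset_index:
dfParRun.query('xMax > 0.05 & lim == 10')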
setup
import pandas as pd
import numpy as np
import itertools

limOpt = [0.1, 1, 10]
reimbOpt = ['Cash', 'Time']
xOpt = [0.1, .02, .03, .04, .05, .06, .07, .08]
zOpt = [1, 5, 10]
arrays = [limOpt, reimbOpt, xOpt, zOpt]
parameters = list(itertools.product(*arrays))
nPar = len(parameters)
variables = ['X', 'Y', 'Z']
nVar = len(variables)
index = pd.MultiIndex.from_tuples(parameters, names=['lim', 'reimb', 'xMax', 'zMax'])
dfParRun = pd.DataFrame(np.random.rand(nPar, nVar), index=index, columns=variables)
df = dfParRun.reset_index()