Pandas: Loop over GroupBy / Two aggregations / Fillna not inplace - pandas

I have a dataframe which uses an identifier for groups and has several columns with missing values.
toy_df = pd.DataFrame({'ID':[1,1, 1, 2, 2, 2, 3, 3, 3],
'Age': [10, 10, 10, 20, 20, 20, 20, 20, 20],
'A': [np.nan, 5, 5, np.nan, np.nan, np.nan, 10, 12, 12],
'B': [3, 4, 5, 2, 2, 1, np.nan, 4, 3]})
ID Age A B
0 1 10 NaN 3.0
1 1 10 5.0 4.0
2 1 10 5.0 5.0
3 2 20 NaN 2.0
4 2 20 NaN 2.0
5 2 20 NaN 1.0
6 3 20 10.0 NaN
7 3 20 12.0 4.0
8 3 20 12.0 3.0
Now I want to fill NaNs by some rules either within groups of same age or just within the ID group
group_mode = toy_df.groupby('Age')['A'].apply(lambda x: list(x.mode()))
group_median = toy_df.groupby('Age')['A'].median()
def impute_column(series, group_mode, group_median, agg_key, key, age):
if series.isna().sum() == series.shape[0]:
modes = group_mode[group_mode.index == age]
#if multiple modes are available use median
if np.ravel(modes.to_list()).shape[0] > 1:
median_ = group_median[group_median.index == age]
series = series.fillna(value = median_)
else:
mode_ = modes.item()[0]
series = series.fillna(value = mode_)
#if up to 3 values are missing use linear interpolation
elif series.isna().sum() < 4:
series = series.interpolate(limit_direction='both', method='linear')
#else we have sparse values / use median
else:
median_ = series.median()
series = series.fillna(value = median_)
return series
And if I test it with one of the columns & groups it works
impute_column(series=toy_df['A'], group_mode=group_mode, group_median=group_median,
agg_key='Age', key = 'A', age=10)
0 10.0
1 5.0
2 5.0
3 10.0
4 10.0
5 10.0
6 10.0
7 12.0
8 12.0
Now I want to be efficient and
group over IDs
loop over the grouped object
loop over all columns
and update my dataframe
all_columns = ['A', 'B']
grouped = toy_df.groupby('ID')
for key in all_columns:
group_mode = toy_df.groupby('Age')[key].apply(lambda x: list(x.mode()))
group_median = toy_df.groupby('Age')[key].median()
for _, group in grouped:
age = group['Age'].iloc[0]
group[key] = impute_column(series=group[key], group_mode=group_mode, group_median=group_median,
agg_key='Age', key=key, age=age)
The calculations are running (I've printed them out), but the final dataframe ain't updated
ID Age A B
0 1 10 NaN 3.0
1 1 10 5.0 4.0
2 1 10 5.0 5.0
3 2 20 NaN 2.0
4 2 20 NaN 2.0
5 2 20 NaN 1.0
6 3 20 10.0 NaN
7 3 20 12.0 4.0
8 3 20 12.0 3.0
what seems to work is the code below.
but as you can see, it does not follow the bullet points above. Further I am pretty sure, that computing big groupby objects for each group is immensely inefficient
def impute_column(series, group_mode, group_median, key, age):
if series.isna().sum() == series.shape[0]:
modes = group_mode[group_mode.index == age]
#if multiple modes are available use median
if np.ravel(modes.to_list()).shape[0] > 1:
median_ = group_median[group_median.index == age]
series = series.fillna(value = median_)
else:
mode_ = modes.item()[0]
series = series.fillna(value = mode_)
#if up to 3 values are missing use linear interpolation
elif series.isna().sum() < 4:
series = series.interpolate(limit_direction='both', method='linear')
#else we have sparse values / use median
else:
median_ = series.median()
series = series.fillna(value = median_)
return series
def impute_frame(data, full_data, agg_key):
age = data['Age'].iloc[0]
for key in ['A', 'B']:
group_mode = full_data.groupby(agg_key)[key].apply(lambda x: list(x.mode()))
group_median = full_data.groupby(agg_key)[key].median()
data[key] = impute_column(data[key], group_mode, group_median, key, age)
return data
toy_df.groupby('ID').apply(impute_frame, full_data=toy_df, agg_key='Age')
ID Age A B
0 1 10 5.0 3.0
1 1 10 5.0 4.0
2 1 10 5.0 5.0
3 2 20 12.0 2.0
4 2 20 12.0 2.0
5 2 20 12.0 1.0
6 3 20 10.0 4.0
7 3 20 12.0 4.0
8 3 20 12.0 3.0

Related

Pandas xs where level in list of options

If I have a pd.DataFrame that looks like:
new_df = []
for i in range(10):
df_example = pd.DataFrame(np.random.normal(size=[10,1]))
cols = [round(np.random.uniform(low=0,high=10)),round(np.random.uniform(low=0,high=10)),
round(np.random.uniform(low=0,high=10)),round(np.random.uniform(low=0,high=10))]
keys = ['A','B','C','D']
new_ix = pd.MultiIndex.from_tuples([cols],names=keys)
df_example.columns = new_ix
new_df.append(df_example)
new_df = pd.concat(new_df,axis=1)
Which could yield something like:
Now, if I want where C=4 and A=1 I can do:
df.xs(axis=1,level=['A','C'],key=[1,4])
How do I express if I want:
C in [4,2] and A in [5,2]
C in [4,2] or A in [5,2]
To the best of my knowledge, you can't use anything but tuples for key parameter in xs, so such queries are not possible.
The next best thing is to define helper functions for that purpose, such as the following:
def xs_or(df: pd.DataFrame, params: dict[str, list[int]]) -> pd.DataFrame:
"""Helper function.
Args:
df: input dataframe.
params: columns/values to query.
Returns:
Filtered dataframe.
"""
df = pd.concat(
[
df.xs(axis=1, level=[level], key=(key,))
for level, keys in params.items()
for key in keys
],
axis=1,
)
for level in params.keys():
try:
df = df.droplevel([level], axis=1)
except KeyError:
pass
return df
def xs_and(df: pd.DataFrame, params: dict[str, list[int]]) -> pd.DataFrame:
"""Helper function.
Args:
df: input dataframe.
params: columns/values to query.
Returns:
Filtered dataframe.
"""
for level, keys in params.items():
df = xs_or(df, {level: keys})
return df
And so, with the following dataframe named df:
A 4 7 3 1 7 9 4 0 3 9
B 6 7 4 6 7 5 8 0 8 0
C 2 10 5 2 9 9 4 3 4 5
D 0 1 7 3 8 3 6 7 9 10
0 -0.199458 1.155345 1.298027 0.575606 0.785291 -1.126484 0.019082 1.765094 0.034631 -0.243635
1 1.173873 0.523277 -0.709546 1.378983 0.266661 1.626118 1.647584 -0.228162 -1.708271 0.111583
2 0.321156 0.049470 -0.611111 -1.238887 1.092369 0.019503 -0.473618 1.804474 -0.850320 -0.217921
3 0.339307 -0.758909 0.072159 1.636119 -0.541920 -0.160791 -1.131100 1.081766 -0.530082 -0.546489
4 -1.523110 -0.662232 -0.434115 1.698073 0.568690 0.836359 -0.833581 0.230585 0.166119 1.085600
5 0.020645 -1.379587 -0.608083 -1.455928 1.855402 1.714663 -0.739409 1.270043 1.650138 -0.718430
6 1.280583 -1.317288 0.899278 -0.032213 -0.347234 2.543415 0.272228 -0.664116 -1.404851 -0.517939
7 -1.201619 0.724669 -0.705984 0.533725 0.820124 0.651339 0.363214 0.727381 -0.282170 0.651201
8 1.829209 0.049628 0.655277 -0.237327 -0.007662 1.849530 0.095479 0.295623 -0.856162 -0.350407
9 -0.690613 1.419008 -0.791556 0.180751 -0.648182 0.240589 -0.247574 -1.947492 -1.010009 1.549234
You can filter like this:
# C in [10, 2] or A in [1, 0]
print(xs_or(df, {"C": [10, 2], "A": [1, 0]}))
# Output
B 7 6 2 3
D 1 0 3 3 7
0 1.155345 -0.199458 0.575606 0.575606 1.765094
1 0.523277 1.173873 1.378983 1.378983 -0.228162
2 0.049470 0.321156 -1.238887 -1.238887 1.804474
3 -0.758909 0.339307 1.636119 1.636119 1.081766
4 -0.662232 -1.523110 1.698073 1.698073 0.230585
5 -1.379587 0.020645 -1.455928 -1.455928 1.270043
6 -1.317288 1.280583 -0.032213 -0.032213 -0.664116
7 0.724669 -1.201619 0.533725 0.533725 0.727381
8 0.049628 1.829209 -0.237327 -0.237327 0.295623
9 1.419008 -0.690613 0.180751 0.180751 -1.947492
# C in [10, 2] and A in [1, 7]
print(xs_and(df, {"C": [10, 2], "A": [1, 7]}))
# Output
B 6 7
D 3 1
0 0.575606 1.155345
1 1.378983 0.523277
2 -1.238887 0.049470
3 1.636119 -0.758909
4 1.698073 -0.662232
5 -1.455928 -1.379587
6 -0.032213 -1.317288
7 0.533725 0.724669
8 -0.237327 0.049628
9 0.180751 1.419008

Concise way to concatenate consecutive rows in pandas

I would like to take a dataframe and concatenate consecutive rows for comparison.
e.g.
Take
xyt = pd.DataFrame(np.concatenate((np.random.randn(3,2), np.arange(3).reshape((3, 1))), axis=1), columns=['x','y','t'])
Which looks something like:
x y t
0 1.237007 -1.035837 0.0
1 -1.782458 1.042942 1.0
2 0.063130 0.355014 2.0
And make:
a b
x y t x y t
0 1.237007 -1.035837 0.0 -1.782458 1.042942 1.0
1 -1.782458 1.042942 1.0 0.063130 0.355014 2.0
The best I could come up with was:
pd.DataFrame(
[np.append(x,y) for (x, y) in zip(xyt.values, xyt[1:].values)],
columns=pd.MultiIndex.from_product([('a', 'b'), xyt.columns]))
Is there a better way?
Let's try concat on axis=1 with the shifted frame:
import pandas as pd
xyt = pd.DataFrame({'x': {0: 1.237007, 1: -1.782458, 2: 0.06313},
'y': {0: -1.035837, 1: 1.042942, 2: 0.355014},
't': {0: 0.0, 1: 1.0, 2: 2.0}})
merged = pd.concat((xyt, xyt.shift(-1)), axis=1, keys=('a', 'b')).iloc[:-1]
print(merged)
merged:
a b
x y t x y t
0 1.237007 -1.035837 0.0 -1.782458 1.042942 1.0
1 -1.782458 1.042942 1.0 0.063130 0.355014 2.0
You can use pd.concat:
# Generate random data
n = 10
x, y = np.random.randn(2, n)
t = np.arange(n)
xyt = pd.DataFrame({
'x': x, 'y': y, 't': t
})
# The call
pd.concat([xyt, xyt.shift(-1)], axis=1, keys=['a','b'])
# Result
a b
x y t x y t
0 1.180544 1.707380 0 -0.227370 0.734225 1.0
1 -0.227370 0.734225 1 0.271997 -1.039424 2.0
2 0.271997 -1.039424 2 -0.729960 -1.081224 3.0
3 -0.729960 -1.081224 3 0.185301 0.530126 4.0
4 0.185301 0.530126 4 -0.175333 -0.126157 5.0
5 -0.175333 -0.126157 5 -0.634870 0.068683 6.0
6 -0.634870 0.068683 6 0.350867 0.361564 7.0
7 0.350867 0.361564 7 0.090678 -0.269504 8.0
8 0.090678 -0.269504 8 0.177076 -0.976640 9.0
9 0.177076 -0.976640 9 NaN NaN NaN

python pandas divide dataframe in method chain

I want to divide a dataframe by a number:
df = df/10
Is there a way to do this in a method chain?
# idea:
df = df.filter(['a','b']).query("a>100").assign(**divide by 10)
We can use DataFrame.div here:
df = df[['a','b']].query("a>100").div(10)
a b
0 40.0 0.7
1 50.0 0.8
5 70.0 0.3
Use DataFrame.pipe with lambda function for use some function for all data of DataFrame:
df = pd.DataFrame({
'a':[400,500,40,50,5,700],
'b':[7,8,9,4,2,3],
'c':[1,3,5,7,1,0],
'd':[5,3,6,9,2,4]
})
df = df.filter(['a','b']).query("a>100").pipe(lambda x: x / 10)
print (df)
a b
0 40.0 0.7
1 50.0 0.8
5 70.0 0.3
Here if use apply all columns are divided separately:
df = df.filter(['a','b']).query("a>100").apply(lambda x: x / 10)
You can see difference with print:
df1 = df.filter(['a','b']).query("a>100").pipe(lambda x: print (x))
a b
0 400 7
1 500 8
5 700 3
df2 = df.filter(['a','b']).query("a>100").apply(lambda x: print (x))
0 400
1 500
5 700
Name: a, dtype: int64
0 7
1 8
5 3
Name: b, dtype: int64

'float' object has no attribute 'split'

I have a pandas data-frame with a column with float numbers. I tried to split each item in a column by dot '.'. Then I want to add first items to second items. I don't know why this sample code is not working.
data=
0 28.47000
1 28.45000
2 28.16000
3 28.29000
4 28.38000
5 28.49000
6 28.21000
7 29.03000
8 29.11000
9 28.11000
new_array = []
df = list(data)
for i in np.arange(len(data)):
df1 = df[i].split('.')
df2 = df1[0]+df[1]/60
new_array=np.append(new_array,df2)
Use numpy.modf with DataFrame constructor:
arr = np.modf(data.values)
df = pd.DataFrame({'a':data, 'b':arr[1] + arr[0] / 60})
print (df)
a b
0 28.47 28.007833
1 28.45 28.007500
2 28.16 28.002667
3 28.29 28.004833
4 28.38 28.006333
5 28.49 28.008167
6 28.21 28.003500
7 29.03 29.000500
8 29.11 29.001833
9 28.11 28.001833
Detail:
arr = np.modf(data.values)
print(arr)
(array([ 0.47, 0.45, 0.16, 0.29, 0.38, 0.49, 0.21, 0.03, 0.11, 0.11]),
array([ 28., 28., 28., 28., 28., 28., 28., 29., 29., 28.]))
print(arr[0] / 60)
[ 0.00783333 0.0075 0.00266667 0.00483333 0.00633333 0.00816667
0.0035 0.0005 0.00183333 0.00183333]
EDIT:
df = pd.DataFrame({'a':data, 'b':arr[1] + arr[0]*5/3 })
print (df)
a b
0 28.47 28.783333
1 28.45 28.750000
2 28.16 28.266667
3 28.29 28.483333
4 28.38 28.633333
5 28.49 28.816667
6 28.21 28.350000
7 29.03 29.050000
8 29.11 29.183333
9 28.11 28.183333
Your data types are floats, not strings, and so cannot be .split() (this is a string method). Instead you can look to use math.modf to 'split' a float into fractional and decimal parts
https://docs.python.org/3.6/library/math.html
import math
def process(x:float, divisor:int=60) -> float:
"""
Convert a float to its constituent parts. Divide the fractional part by the divisor, and then recombine creating a 'scaled fractional' part,
"""
b, a = math.modf(x)
c = a + b/divisor
return c
df['data'].apply(process)
Out[17]:
0 28.007833
1 28.007500
2 28.002667
3 28.004833
4 28.006333
5 28.008167
6 28.003500
7 29.000500
8 29.001833
9 28.001833
Name: data=, dtype: float64
Your other option is to convert them to strings, split, convert to ints and floats again, do some maths and then combine the floats. I'd rather keep the object as it is personally.

df.loc[rows, [col]] vs df.loc[rows, col] in assignment

What do the following assignments behave differently?
df.loc[rows, [col]] = ...
df.loc[rows, col] = ...
For example:
r = pd.DataFrame({"response": [1,1,1],},index = [1,2,3] )
df = pd.DataFrame({"x": [999,99,9],}, index = [3,4,5] )
df = pd.merge(df, r, how="left", left_index=True, right_index=True)
df.loc[df["response"].isnull(), "response"] = 0
print df
x response
3 999 0.0
4 99 0.0
5 9 0.0
but
df.loc[df["response"].isnull(), ["response"]] = 0
print df
x response
3 999 1.0
4 99 0.0
5 9 0.0
why should I expect the first to behave differently to the second?
df.loc[df["response"].isnull(), ["response"]]
returns a DataFrame, so if you want to assign something to it it must be aligned by both index and columns
Demo:
In [79]: df.loc[df["response"].isnull(), ["response"]] = \
pd.DataFrame([11,12], columns=['response'], index=[4,5])
In [80]: df
Out[80]:
x response
3 999 1.0
4 99 11.0
5 9 12.0
alternatively you can assign an array/matrix of the same shape:
In [83]: df.loc[df["response"].isnull(), ["response"]] = [11, 12]
In [84]: df
Out[84]:
x response
3 999 1.0
4 99 11.0
5 9 12.0
I'd also consider using fillna() method:
In [88]: df.response = df.response.fillna(0)
In [89]: df
Out[89]:
x response
3 999 1.0
4 99 0.0
5 9 0.0