I would like to take a dataframe and concatenate consecutive rows for comparison.
e.g.
Take
import numpy as np
import pandas as pd

xyt = pd.DataFrame(np.concatenate((np.random.randn(3, 2), np.arange(3).reshape((3, 1))), axis=1),
                   columns=['x', 'y', 't'])
Which looks something like:
          x         y    t
0  1.237007 -1.035837  0.0
1 -1.782458  1.042942  1.0
2  0.063130  0.355014  2.0
And make:
          a                         b
          x         y    t         x         y    t
0  1.237007 -1.035837  0.0 -1.782458  1.042942  1.0
1 -1.782458  1.042942  1.0  0.063130  0.355014  2.0
The best I could come up with was:
pd.DataFrame(
    [np.append(x, y) for (x, y) in zip(xyt.values, xyt[1:].values)],
    columns=pd.MultiIndex.from_product([('a', 'b'), xyt.columns]))
Is there a better way?
Let's try concat on axis=1 with the shifted frame:
import pandas as pd
xyt = pd.DataFrame({'x': {0: 1.237007, 1: -1.782458, 2: 0.06313},
                    'y': {0: -1.035837, 1: 1.042942, 2: 0.355014},
                    't': {0: 0.0, 1: 1.0, 2: 2.0}})
merged = pd.concat((xyt, xyt.shift(-1)), axis=1, keys=('a', 'b')).iloc[:-1]
print(merged)
merged:
          a                         b
          x         y    t         x         y    t
0  1.237007 -1.035837  0.0 -1.782458  1.042942  1.0
1 -1.782458  1.042942  1.0  0.063130  0.355014  2.0
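If you ever need windows longer than two consecutive rows, the same idea extends: concatenate several shifted copies, one key per offset. A sketch (the window_frame helper and its letter keys are made up for illustration):
import string

def window_frame(df, k):
    # one shifted copy per offset, labelled 'a', 'b', 'c', ...
    keys = list(string.ascii_lowercase[:k])
    shifted = [df.shift(-i) for i in range(k)]
    # keep only the rows that still have a complete window of k rows
    return pd.concat(shifted, axis=1, keys=keys).iloc[:len(df) - k + 1]

window_frame(xyt, 2)  # reproduces merged above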
You can use pd.concat:
import numpy as np
import pandas as pd

# Generate random data
n = 10
x, y = np.random.randn(2, n)
t = np.arange(n)
xyt = pd.DataFrame({'x': x, 'y': y, 't': t})
# The call
pd.concat([xyt, xyt.shift(-1)], axis=1, keys=['a','b'])
# Result
          a                        b
          x         y  t          x         y    t
0  1.180544  1.707380  0  -0.227370  0.734225  1.0
1 -0.227370  0.734225  1   0.271997 -1.039424  2.0
2  0.271997 -1.039424  2  -0.729960 -1.081224  3.0
3 -0.729960 -1.081224  3   0.185301  0.530126  4.0
4  0.185301  0.530126  4  -0.175333 -0.126157  5.0
5 -0.175333 -0.126157  5  -0.634870  0.068683  6.0
6 -0.634870  0.068683  6   0.350867  0.361564  7.0
7  0.350867  0.361564  7   0.090678 -0.269504  8.0
8  0.090678 -0.269504  8   0.177076 -0.976640  9.0
9  0.177076 -0.976640  9        NaN       NaN  NaN
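Note the trailing row of NaNs: the last row has no successor to pair with. If you don't want it, slice it off as the previous answer does:
pd.concat([xyt, xyt.shift(-1)], axis=1, keys=['a', 'b']).iloc[:-1]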
I have a dataframe which uses an identifier for groups and has several columns with missing values.
import numpy as np
import pandas as pd

toy_df = pd.DataFrame({'ID': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                       'Age': [10, 10, 10, 20, 20, 20, 20, 20, 20],
                       'A': [np.nan, 5, 5, np.nan, np.nan, np.nan, 10, 12, 12],
                       'B': [3, 4, 5, 2, 2, 1, np.nan, 4, 3]})
ID Age A B
0 1 10 NaN 3.0
1 1 10 5.0 4.0
2 1 10 5.0 5.0
3 2 20 NaN 2.0
4 2 20 NaN 2.0
5 2 20 NaN 1.0
6 3 20 10.0 NaN
7 3 20 12.0 4.0
8 3 20 12.0 3.0
Now I want to fill the NaNs by some rules, either within groups of the same age or just within the ID group:
group_mode = toy_df.groupby('Age')['A'].apply(lambda x: list(x.mode()))
group_median = toy_df.groupby('Age')['A'].median()
def impute_column(series, group_mode, group_median, agg_key, key, age):
    if series.isna().sum() == series.shape[0]:
        modes = group_mode[group_mode.index == age]
        # if multiple modes are available use median
        if np.ravel(modes.to_list()).shape[0] > 1:
            median_ = group_median[group_median.index == age]
            series = series.fillna(value=median_)
        else:
            mode_ = modes.item()[0]
            series = series.fillna(value=mode_)
    # if up to 3 values are missing use linear interpolation
    elif series.isna().sum() < 4:
        series = series.interpolate(limit_direction='both', method='linear')
    # else we have sparse values / use median
    else:
        median_ = series.median()
        series = series.fillna(value=median_)
    return series
And if I test it with one of the columns and groups, it works:
impute_column(series=toy_df['A'], group_mode=group_mode, group_median=group_median,
              agg_key='Age', key='A', age=10)
0 10.0
1 5.0
2 5.0
3 10.0
4 10.0
5 10.0
6 10.0
7 12.0
8 12.0
Now I want to be efficient and:
- group over IDs
- loop over the grouped object
- loop over all columns
- and update my dataframe
all_columns = ['A', 'B']
grouped = toy_df.groupby('ID')

for key in all_columns:
    group_mode = toy_df.groupby('Age')[key].apply(lambda x: list(x.mode()))
    group_median = toy_df.groupby('Age')[key].median()
    for _, group in grouped:
        age = group['Age'].iloc[0]
        group[key] = impute_column(series=group[key], group_mode=group_mode,
                                   group_median=group_median,
                                   agg_key='Age', key=key, age=age)
The calculations are running (I've printed them out), but the final dataframe is never updated:
ID Age A B
0 1 10 NaN 3.0
1 1 10 5.0 4.0
2 1 10 5.0 5.0
3 2 20 NaN 2.0
4 2 20 NaN 2.0
5 2 20 NaN 1.0
6 3 20 10.0 NaN
7 3 20 12.0 4.0
8 3 20 12.0 3.0
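The likely cause: iterating over a groupby yields copies of the groups, so group[key] = ... writes into a throwaway copy and never touches toy_df. A minimal fix sketch, keeping the loop above, is to assign the result back into toy_df by the group's index:
for key in all_columns:
    group_mode = toy_df.groupby('Age')[key].apply(lambda x: list(x.mode()))
    group_median = toy_df.groupby('Age')[key].median()
    for _, group in grouped:
        age = group['Age'].iloc[0]
        # write into the original frame; 'group' itself is only a copy
        toy_df.loc[group.index, key] = impute_column(
            series=group[key], group_mode=group_mode, group_median=group_median,
            agg_key='Age', key=key, age=age)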
What seems to work is the code below. But as you can see, it does not follow the bullet points above. Further, I am pretty sure that computing big groupby objects for each group is immensely inefficient (a variant that avoids this is sketched after the output below).
def impute_column(series, group_mode, group_median, key, age):
    if series.isna().sum() == series.shape[0]:
        modes = group_mode[group_mode.index == age]
        # if multiple modes are available use median
        if np.ravel(modes.to_list()).shape[0] > 1:
            median_ = group_median[group_median.index == age]
            series = series.fillna(value=median_)
        else:
            mode_ = modes.item()[0]
            series = series.fillna(value=mode_)
    # if up to 3 values are missing use linear interpolation
    elif series.isna().sum() < 4:
        series = series.interpolate(limit_direction='both', method='linear')
    # else we have sparse values / use median
    else:
        median_ = series.median()
        series = series.fillna(value=median_)
    return series
def impute_frame(data, full_data, agg_key):
    age = data['Age'].iloc[0]
    for key in ['A', 'B']:
        group_mode = full_data.groupby(agg_key)[key].apply(lambda x: list(x.mode()))
        group_median = full_data.groupby(agg_key)[key].median()
        data[key] = impute_column(data[key], group_mode, group_median, key, age)
    return data
toy_df.groupby('ID').apply(impute_frame, full_data=toy_df, agg_key='Age')
ID Age A B
0 1 10 5.0 3.0
1 1 10 5.0 4.0
2 1 10 5.0 5.0
3 2 20 12.0 2.0
4 2 20 12.0 2.0
5 2 20 12.0 1.0
6 3 20 10.0 4.0
7 3 20 12.0 4.0
8 3 20 12.0 3.0
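On the efficiency concern: the Age-level aggregations inside impute_frame do not depend on the ID group at all, so they can be computed once per column up front and passed in. A sketch reusing impute_column from above (impute_frame_fast and stats are illustrative names):
# precompute the Age-level mode lists and medians once per column
stats = {key: (toy_df.groupby('Age')[key].apply(lambda x: list(x.mode())),
               toy_df.groupby('Age')[key].median())
         for key in ['A', 'B']}

def impute_frame_fast(data, stats):
    age = data['Age'].iloc[0]
    for key, (group_mode, group_median) in stats.items():
        data[key] = impute_column(data[key], group_mode, group_median, key, age)
    return data

toy_df.groupby('ID').apply(impute_frame_fast, stats=stats)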
I want to divide a dataframe by a number:
df = df/10
Is there a way to do this in a method chain?
# idea:
df = df.filter(['a','b']).query("a>100").assign(**divide by 10)
We can use DataFrame.div here:
df = df[['a','b']].query("a>100").div(10)
a b
0 40.0 0.7
1 50.0 0.8
5 70.0 0.3
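div also accepts per-column divisors, aligned on the column labels, in case the factor is not uniform. A sketch starting from the original df (the divisor values are made up):
df[['a', 'b']].query("a>100").div(pd.Series({'a': 10, 'b': 100}))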
Use DataFrame.pipe with a lambda function to apply some function to all the data of the DataFrame:
import pandas as pd

df = pd.DataFrame({
    'a': [400, 500, 40, 50, 5, 700],
    'b': [7, 8, 9, 4, 2, 3],
    'c': [1, 3, 5, 7, 1, 0],
    'd': [5, 3, 6, 9, 2, 4]
})
df = df.filter(['a','b']).query("a>100").pipe(lambda x: x / 10)
print (df)
a b
0 40.0 0.7
1 50.0 0.8
5 70.0 0.3
If you use apply instead, each column is divided separately (the lambda receives one column at a time):
df = df.filter(['a','b']).query("a>100").apply(lambda x: x / 10)
You can see the difference with print:
df1 = df.filter(['a','b']).query("a>100").pipe(lambda x: print (x))
a b
0 400 7
1 500 8
5 700 3
df2 = df.filter(['a','b']).query("a>100").apply(lambda x: print (x))
0 400
1 500
5 700
Name: a, dtype: int64
0 7
1 8
5 3
Name: b, dtype: int64
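For completeness, the assign idea from the question chains as well, though each column has to be named explicitly. A sketch starting from the original df:
df.filter(['a', 'b']).query("a>100").assign(a=lambda d: d['a'] / 10,
                                            b=lambda d: d['b'] / 10)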
I have this pandas df:
                     value
index1 index2 index3
1      1      1       10.0
              2       -0.5
              3        0.0
2      2      1        3.0
              2        0.0
              3        0.0
       3      1        0.0
              2       -5.0
              3        6.0
I would like to get the 'value' of a specific combination of index, using a dict.
Usually, I use, for example:
df = df.iloc[df.index.isin([2],level='index1')]
df = df.iloc[df.index.isin([3],level='index2')]
df = df.iloc[df.index.isin([2],level='index3')]
value = df.values[0][0]
Now, I would like to get my value = -5 in a shorter way using this dictionary:
d = {'index1':2,'index2':3,'index3':2}
And also, if I use:
d = {'index1':2,'index2':3}
I would like to get the array:
[0.0, -5.0, 6.0]
Tips?
You can use the SQL-like method DataFrame.query():
In [69]: df.query(' and '.join('{}=={}'.format(k,v) for k,v in d.items()))
Out[69]:
                     value
index1 index2 index3
2.0    3.0    2       -5.0
for another dict:
In [77]: d = {'index1':2,'index2':3}
In [78]: df.query(' and '.join('{}=={}'.format(k,v) for k,v in d.items()))
Out[78]:
                     value
index1 index2 index3
2.0    3.0    1        0.0
              2       -5.0
              3        6.0
A non-query way would be
In [64]: df.loc[np.logical_and.reduce([
   ...:     df.index.get_level_values(k) == v for k, v in d.items()])]
Out[64]:
                     value
index1 index2 index3
2      3      2       -5.0
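Another non-query option is to build an indexing tuple straight from the dict, filling the levels it omits with slice(None); a full dict then returns the scalar, and a partial one the array you asked for. A sketch (lookup is an illustrative name; it assumes the index is lexsorted, as it is here):
def lookup(df, d):
    # levels missing from the dict match everything via slice(None)
    key = tuple(d.get(name, slice(None)) for name in df.index.names)
    return df.loc[key, 'value']

lookup(df, {'index1': 2, 'index2': 3, 'index3': 2})    # -5.0
lookup(df, {'index1': 2, 'index2': 3}).to_numpy()      # array([ 0., -5.,  6.])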
Suppose I have two-leveled multi-indexed dataframe
In [1]: index = pd.MultiIndex.from_tuples([(i,j) for i in range(3)
   ...:                                    for j in range(1+i)], names=list('ij'))
   ...: df = pd.DataFrame(0.1*np.arange(2*len(index)).reshape(-1,2),
   ...:                   columns=list('xy'), index=index)
   ...: df
Out[1]:
       x    y
i j
0 0  0.0  0.1
1 0  0.2  0.3
  1  0.4  0.5
2 0  0.6  0.7
  1  0.8  0.9
  2  1.0  1.1
And I want to run a custom function on every sub-dataframe:
In [2]: def my_aggr_func(subdf):
   ...:     return subdf['x'].mean() / subdf['y'].mean()
   ...:
   ...: level0 = df.index.levels[0].values
   ...: pd.DataFrame({'mean_ratio': [my_aggr_func(df.loc[i]) for i in level0]},
   ...:              index=pd.Index(level0, name=index.names[0]))
Out[2]:
mean_ratio
i
0 0.000000
1 0.750000
2 0.888889
Is there an elegant way to do it with df.groupby('i').agg(__something__) or something similar?
You need GroupBy.apply, which works with the whole sub-DataFrame:
df1 = df.groupby('i').apply(my_aggr_func).to_frame('mean_ratio')
print (df1)
mean_ratio
i
0 0.000000
1 0.750000
2 0.888889
You don't need the custom function. You can calculate the within-group means with agg, then perform an eval to get the ratio you want.
df.groupby('i').agg('mean').eval('x / y')
i
0 0.000000
1 0.750000
2 0.888889
dtype: float64
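If you want the ratio back as a named column like the question's expected output, tack the naming on at the end. A sketch:
df.groupby('i').mean().eval('x / y').rename('mean_ratio').to_frame()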
Why do the following assignments behave differently?
df.loc[rows, [col]] = ...
df.loc[rows, col] = ...
For example:
r = pd.DataFrame({"response": [1,1,1],},index = [1,2,3] )
df = pd.DataFrame({"x": [999,99,9],}, index = [3,4,5] )
df = pd.merge(df, r, how="left", left_index=True, right_index=True)
df.loc[df["response"].isnull(), "response"] = 0
print(df)
x response
3 999 0.0
4 99 0.0
5 9 0.0
but
df.loc[df["response"].isnull(), ["response"]] = 0
print(df)
x response
3 999 1.0
4 99 0.0
5 9 0.0
Why should I expect the first to behave differently from the second?
df.loc[df["response"].isnull(), ["response"]]
returns a DataFrame, so if you want to assign something to it, the value must be aligned by both index and columns.
Demo:
In [79]: df.loc[df["response"].isnull(), ["response"]] = \
   ...:     pd.DataFrame([11, 12], columns=['response'], index=[4, 5])
In [80]: df
Out[80]:
x response
3 999 1.0
4 99 11.0
5 9 12.0
Alternatively, you can assign an array/matrix of the same shape:
In [83]: df.loc[df["response"].isnull(), ["response"]] = [11, 12]
In [84]: df
Out[84]:
x response
3 999 1.0
4 99 11.0
5 9 12.0
I'd also consider using the fillna() method:
In [88]: df.response = df.response.fillna(0)
In [89]: df
Out[89]:
x response
3 999 1.0
4 99 0.0
5 9 0.0