pandas Groupby after groupby - pandas

df = pd.DataFrame({'A': [1,2,3,1,2,3], 'B': [10,10,11,10,10,15], 'key1':['a','b','a','b','c','c'],'key2':1})
df1 = pd.DataFrame({'A': [1,2,3,1,2,3], 'B': [100,100,110,100,100,150], 'key1':['a','c','b','a','a','c'],'key2':1})
dfn = pd.merge(df,df1,on='key2')
dfn_grouped = dfn.groupby('key1_y')
The result of list(dfn_grouped) is:
[('a', A_x B_x key1_x key2 A_y B_y key1_y
0 1 10 a 1 1 100 a
3 1 10 a 1 1 100 a
... ... ... ...
33 3 15 c 1 1 100 a
34 3 15 c 1 2 100 a),
('b', A_x B_x key1_x key2 A_y B_y key1_y
2 1 10 a 1 3 110 b
8 2 10 b 1 3 110 b
14 3 11 a 1 3 110 b
20 1 10 b 1 3 110 b
26 2 10 c 1 3 110 b
32 3 15 c 1 3 110 b),
('c', A_x B_x key1_x key2 A_y B_y key1_y
1 1 10 a 1 2 100 c
...... ... ....
35 3 15 c 1 3 150 c)]
Now I need to group dfn_grouped by "key1_x" and combine the values into a dict mapping A_x to A_y, like:
key1_y key1_x A_X:A_Y
b a {'10':'110','11':110}
b b {'10':110}
b c {'10':110,'15':110}
// if A_x is already in the dict, append the A_y value to a list, like:
// b e {'10':[11,12]}

Is this what you need?
>> grouped = dfn.groupby(['key1_y','key1_x','A_x'])
>> dfg = pd.DataFrame(grouped.apply(lambda x: [a for a in x.A_y])).reset_index()
>> dfg.columns = [u'key1_y', u'key1_x', u'A_x', 'dic_values']
>> dfg['dic'] = [{a:b} for a,b in zip(dfg.A_x.values,dfg.dic_values.values)]
>> dfg.drop(['A_x','dic_values'],1,inplace=True)
>> g_dics = dfg.groupby(['key1_y','key1_x']).apply(lambda x: dict(sum(map(dict.items, [d for d in x.dic]), [])))
>> pd.DataFrame(g_dics).reset_index()

Related

Split and concatenate dataframe

So I have a dataframe which looks like this one:
>>>df = pd.DataFrame({
'id': [i for i in range(5)],
'1': ['a', 'b', 'c', 'd', 'e'],
'2': ['f', 'g', 'h', 'i', 'g']
})
>>>df
id 1 2
0 0 a f
1 1 b g
2 2 c h
3 3 d i
4 4 e g
I want to convert this dataframe to the following dataframe:
>>>df_concatenated
id val
1 0 a
1 1 b
1 2 c
1 3 d
1 4 e
2 0 f
2 1 g
2 2 h
2 3 i
2 4 g
One way is to use pd.melt:
pd.melt(df, id_vars=['id'], value_vars=['1','2']).set_index('variable',append=True)
The other is by splitting with the .iloc accessor and concatenating. Long, but it works:
res1=df.iloc[:,[0,2]]
res1.columns=['id','val']
res=df.iloc[:,:2]
res.columns=['id','val']
res2=pd.concat([res1,res])
res2
variable id value
0 1 0 a
1 1 1 b
2 1 2 c
3 1 3 d
4 1 4 e
5 2 0 f
6 2 1 g
7 2 2 h
8 2 3 i
9 2 4 g
You can try this:
df = df.rename({"1":"val"},axis=1)
df_temp = df[["id","2"]]
df_temp = df_temp.rename({"2":"val"},axis=1)
df.drop("2",axis=1,inplace=True)
out_df = pd.concat([df,df_temp],axis=0).reset_index(drop=True)
print(out_df)
Output:
id val
0 0 a
1 1 b
2 2 c
3 3 d
4 4 e
5 0 f
6 1 g
7 2 h
8 3 i
9 4 g

Concatenate alternate scalar column to pandas based on condition

I have a master dataframe and a tag list, as follows:
import pandas as pd
i = ['A'] * 2 + ['B'] * 3 + ['A'] * 4 + ['B'] * 5
master = pd.DataFrame(i, columns={'cat'})
tag = [0, 1]
How can I insert a column of tags that follows the normal tag order for cat A, but the reversed order for cat B? The expected output is:
cat tags
0 A 0
1 A 1
2 B 1
3 B 0
4 B 1
5 A 0
6 A 1
7 A 0
8 A 1
9 B 1
10 B 0
...
EDIT: Because it is necessary to process each consecutive group separately, I tried to create a general solution:
tag = ['a','b','c']
r = range(len(tag))
r1 = range(len(tag)-1, -1, -1)
print (dict(zip(r1, tag)))
{2: 'a', 1: 'b', 0: 'c'}
m1 = master['cat'].eq('A')
m2 = master['cat'].eq('B')
s = master['cat'].ne(master['cat'].shift()).cumsum()
master['tags'] = master.groupby(s).cumcount() % len(tag)
master.loc[m1, 'tags'] = master.loc[m1, 'tags'].map(dict(zip(r, tag)))
master.loc[m2, 'tags'] = master.loc[m2, 'tags'].map(dict(zip(r1, tag)))
print (master)
cat tags
0 A a
1 A b
2 B c
3 B b
4 B a
5 A a
6 A b
7 A c
8 A a
9 B c
10 B b
11 B a
12 B c
13 B b
Another approach is to create a DataFrame from the tags and merge with a left join:
tag = ['a','b','c']
s = master['cat'].ne(master['cat'].shift()).cumsum()
master['g'] = master.groupby(s).cumcount() % len(tag)
d = {'A': tag, 'B':tag[::-1]}
df = pd.DataFrame([(k,i,x)
for k, v in d.items()
for i, x in enumerate(v)], columns=['cat','g','tags'])
print (df)
cat g tags
0 A 0 a
1 A 1 b
2 A 2 c
3 B 0 c
4 B 1 b
5 B 2 a
master = master.merge(df, on=['cat','g'], how='left').drop('g', axis=1)
print (master)
cat tags
0 A a
1 A b
2 B c
3 B b
4 B a
5 A a
6 A b
7 A c
8 A a
9 B c
10 B b
11 B a
12 B c
13 B b
The idea is to use numpy.tile to repeat the tag values by the number of matched values (with integer division), and then filter by indexing and assign with both masks:
le = len(tag)
m1 = master['cat'].eq('A')
m2 = master['cat'].eq('B')
s1 = m1.sum()
s2 = m2.sum()
master.loc[m1, 'tags'] = np.tile(tag, s1 // le + le)[:s1]
#swapped order for m2 mask
master.loc[m2, 'tags'] = np.tile(tag[::-1], s2// le + le)[:s2]
print (master)
cat tags
0 A 0.0
1 A 1.0
2 B 1.0
3 B 0.0
4 B 1.0
5 A 0.0
6 A 1.0
7 A 0.0
8 A 1.0
IIUC, use GroupBy.cumcount + Series.mod.
Then we invert the sequence where cat is B with Series.mask:
s = df.groupby('cat').cumcount().mod(2)
df['tags'] = s.mask(df['cat'].eq('B'), ~s.astype(bool)).astype(int)
print(df)
cat tags
0 A 0
1 A 1
2 B 1
3 B 0
4 B 1
5 A 0
6 A 1
7 A 0
8 A 1
numpy.place might help here:
#create temp column :
mapp={'A':0,'B':1}
res = (master.assign(temp=master.cat.map(mapp),
tag = master.cat
)
)
#locate point where B changes to A
split_point = res.loc[res.temp.diff().eq(-1)].index
split_point
Int64Index([5], dtype='int64')
#split into sections :
spl = np.split(res.cat,split_point)
def replace(entry):
np.place(entry.to_numpy(), entry=="A",[0,1])
np.place(entry.to_numpy(),entry=="B",[1,0])
return entry
res.tag = pd.concat(map(replace,spl))
res.drop('temp',axis=1)
cat tag
0 A 0
1 A 1
2 B 1
3 B 0
4 B 1
5 A 0
6 A 1
7 A 0
8 A 1
9 B 1
10 B 0
11 B 1
12 B 0
13 B 1

Select dataframe columns based on column values in Pandas

My dataframe looks like:
A B C D .... Y Z
0 5 12 14 4 2
3 6 15 10 1 30
2 10 20 12 5 15
I want to create another dataframe that only contains the columns with an average value greater than 10:
C D .... Z
12 14 2
15 10 30
20 12 15
Use:
df = df.loc[:, df.mean() > 10]
print (df)
C D Z
0 12 14 2
1 15 10 30
2 20 12 15
Detail:
print (df.mean())
A 1.666667
B 7.000000
C 15.666667
D 12.000000
Y 3.333333
Z 15.666667
dtype: float64
print (df.mean() > 10)
A False
B False
C True
D True
Y False
Z True
dtype: bool
Alternative:
print (df[df.columns[df.mean() > 10]])
C D Z
0 12 14 2
1 15 10 30
2 20 12 15
Detail:
print (df.columns[df.mean() > 10])
Index(['C', 'D', 'Z'], dtype='object')

Convert an indexed pandas matrix to a flat dataframe

Given the dataframe:
df = pd.DataFrame([['foo', 123, 4, 5, 0, 1], ['foo', 123, 4, 0, 9, 1], ['bar', 33, 0, 0, 3, 5]], columns=list('ABCDEF'))
[out]:
A B C D E F
0 foo 123 4 5 0 1
1 foo 123 4 0 9 1
2 bar 33 0 0 3 5
The goal is to sum certain columns ('C', 'D', 'E', 'F') using other columns ('A' and 'B') as keys to achieve:
A B C D E F
0 foo 123 8 5 9 2
2 bar 33 0 0 3 5
I've tried:
df.groupby(['A', 'B']).sum()
[out]:
C D E F
A B
bar 33 0 0 3 5
foo 123 8 5 9 2
How do I change it back to the non-indexed matrix? i.e.
A B C D E F
0 foo 123 8 5 9 2
2 bar 33 0 0 3 5
You need to add .reset_index().
df.groupby(['A','B']).sum().reset_index()
A B C D E F
0 bar 33 0 0 3 5
1 foo 123 8 5 9 2
or
df.set_index(['A','B']).sum(level=[0,1]).reset_index()
A B C D E F
0 bar 33 0 0 3 5
1 foo 123 8 5 9 2
You can use the parameter as_index=False to return the DataFrame directly:
df1 = df.groupby(['A', 'B'], as_index=False).sum()
print (df1)
A B C D E F
0 bar 33 0 0 3 5
1 foo 123 8 5 9 2

Element wise multiplication of each row

I have two DataFrame objects onto which I want to apply an element-wise multiplication on each row:
df_prob_wc.shape # (3505, 13)
df_prob_c.shape # (13, 1)
I thought I could do it with DataFrame.apply()
df_prob_wc.apply(lambda x: x.multiply(df_prob_c), axis=1)
which gives me:
TypeError: ("'int' object is not iterable", 'occurred at index $')
or with
df_prob_wc.apply(lambda x: x * df_prob_c, axis=1)
which gives me:
TypeError: 'int' object is not iterable
But it's not working.
However, I can do this:
df_prob_wc.apply(lambda x: x * np.asarray([1,2,3,4,5,6,7,8,9,10,11,12,13]), axis=1)
What am I doing wrong here?
It seems you need to multiply by the Series created from df_prob_c via iloc:
df_prob_wc = pd.DataFrame({'A':[1,2,3],
'B':[4,5,6],
'C':[7,8,9],
'D':[1,3,5],
'E':[5,3,6],
'F':[7,4,3]})
print (df_prob_wc)
A B C D E F
0 1 4 7 1 5 7
1 2 5 8 3 3 4
2 3 6 9 5 6 3
df_prob_c = pd.DataFrame([[4,5,6,1,2,3]])
#for align data same columns in both df
df_prob_c.index = df_prob_wc.columns
print (df_prob_c)
0
A 4
B 5
C 6
D 1
E 2
F 3
print (df_prob_wc.shape)
(3, 6)
print (df_prob_c.shape)
(6, 1)
print (df_prob_c.iloc[:,0])
A 4
B 5
C 6
D 1
E 2
F 3
Name: 0, dtype: int64
print (df_prob_wc.mul(df_prob_c.iloc[:,0], axis=1))
A B C D E F
0 4 20 42 1 10 21
1 8 25 48 3 6 12
2 12 30 54 5 12 9
Another solution is to multiply by a numpy array; you only need [:,0] to select the column:
print (df_prob_wc.mul(df_prob_c.values[:,0], axis=1))
A B C D E F
0 4 20 42 1 10 21
1 8 25 48 3 6 12
2 12 30 54 5 12 9
And another solution with DataFrame.squeeze:
print (df_prob_wc.mul(df_prob_c.squeeze(), axis=1))
A B C D E F
0 4 20 42 1 10 21
1 8 25 48 3 6 12
2 12 30 54 5 12 9