pandas agg to concat strings in some columns while keeping max value in others - pandas

# Sample frame: 'ID1' is the grouping key, 'ID2' holds strings, 'bool' holds flags.
df = pd.DataFrame({'ID1' : ['A' , 'A', 'B'],
'ID2' : ['C' , 'D', 'E'],
'bool' : [True, True, False]})
# Group by ID1 and apply the same ';'-join of distinct values to each non-key column.
df_agg = df.groupby('ID1').agg(lambda x: ';'.join(set(x))).reset_index()
# Take 'bool' from the first row seen for each ID1 value (first occurrence, not the maximum).
bool_col = df.drop_duplicates(subset=['ID1'])[['bool']].reset_index(drop=True)
# Put the two pieces side by side; both were re-indexed from 0 above.
final_df = pd.concat([df_agg, bool_col], axis=1)
I want to concatenate the ID2 strings when ID1 is duplicated, but keep only the largest value (True) for the bool column. I almost have it here, but there has to be a better way.

You can pass a dict to agg:
# 'max' yields True whenever any row of the group is True, which is the
# "largest value" the question asks for; 'last' would only reflect whichever
# row happens to come last. sorted() makes the joined order deterministic,
# because the iteration order of a set of strings is not.
out = df.groupby('ID1', as_index=False).agg(
    {'ID2': lambda x: ','.join(sorted(set(x))), 'bool': 'max'}
)
Out[322]:
ID1 ID2 bool
0 A C,D True
1 B E False

You're searching for the largest value of the bool column, so I'll go with this approach:
# Per ID1 group: join the distinct ID2 values and keep the largest 'bool'
# (True > False). sorted() fixes the order of the joined values, which would
# otherwise follow the arbitrary iteration order of the set.
df1 = df.groupby('ID1').agg({
    'ID2': lambda x: ','.join(sorted(set(x))),
    'bool': 'max',
}).reset_index()
print(df1)
Output:
ID1 ID2 bool
0 A C,D True
1 B E False

Related

How to update a pandas column

Given the following dataframe
col1 col2
1 ('A->B', 'B->C')
2 ('A->D', 'D->C', 'C->F')
3 ('A->K', 'K->M', 'M->P')
...
I want to convert this to the following format
col1 col2
1 'A-B-C'
2 'A-D-C-F'
3 'A-K-M-P'
...
Each sequence shows an arc within a path. Hence, the sequence is like (a,b), (b,c), (c,d) ...
def merge_values(val):
    """Collapse a sequence of ``'X->Y'`` arcs into one ``'-'``-joined path.

    Every arc contributes its source node and the target of the final arc
    closes the path, so ``('A->B', 'B->C')`` becomes ``'A-B-C'``.

    Args:
        val: Iterable of arc strings of the form ``'source->target'``.

    Returns:
        The path nodes joined by ``'-'``, or ``''`` when ``val`` is empty.
    """
    arcs = [arc.split('->') for arc in val]
    if not arcs:
        # No arcs means there is no final arc to read the last node from.
        return ''
    nodes = [arc[0] for arc in arcs]
    nodes.append(arcs[-1][1])
    return '-'.join(nodes)
df['col2'] = df['col2'].apply(merge_values)
print(df)
Output:
col1 col2
0 1 A-B-C
1 2 A-D-C-F
2 3 A-K-M-P
Given
df = pd.DataFrame({
'col1': [1, 2, 3],
'col2': [
('A->B', 'B->C'),
('A->D', 'D->C', 'C->F'),
('A->K', 'K->M', 'M->P'),
],
})
You can do:
def combine(t, old_sep='->', new_sep='-'):
    """Join a chain of arcs such as ``('A->B', 'B->C')`` into ``'A-B-C'``.

    Args:
        t: A single arc string or a sequence of arc strings, each of the form
            ``source + old_sep + target``.
        old_sep: Separator between source and target inside each arc.
        new_sep: Separator placed between the nodes of the result.

    Returns:
        The source of every arc followed by the target of the last arc,
        joined with ``new_sep``; ``''`` when ``t`` is empty.
    """
    if not t:
        return ''
    if isinstance(t, str):
        t = [t]
    tokens = [arc.partition(old_sep)[0] for arc in t]
    tail = t[-1].partition(old_sep)[-1]
    # Append the final node as ONE token. ``tokens += tail`` would extend the
    # list character by character and split multi-character node names.
    if tail:
        tokens.append(tail)
    return new_sep.join(tokens)
df['col2'] = df['col2'].apply(combine)

How to modify dataframe based on column values

I want to add relationships to the 'relations' column based on rel_list. Specifically, for each tuple, e.g. ('a', 'b'), I want to replace the empty 'relations' value in the row for 'a' with 'b', but without adding the reverse: the row for 'b' should not get 'a', since the two are considered duplicates. The following code doesn't work correctly:
import pandas as pd
# One row per name; 'relations' starts out empty for every row.
data = {
"names": ['a', 'b', 'c', 'd'],
"ages": [50, 40, 45, 20],
"relations": ['', '', '', '']
}
# (head, tail) pairs: the row whose name is `head` should receive `tail`.
rel_list = [('a', 'b'), ('a', 'c'), ('c', 'd')]
df = pd.DataFrame(data)
for rel_tuple in rel_list:
head = rel_tuple[0]
tail = rel_tuple[1]
# Assigns in place, so a later pair with the same head overwrites the
# value written by an earlier one; no new rows are ever created.
df.loc[df.names == head, 'relations'] = tail
print(df)
The current result of df is:
names ages relations
0 a 50 c
1 b 40
2 c 45 d
3 d 20
However, the correct one is:
names ages relations
0 a 50 b
0 a 50 c
1 b 40
2 c 45 d
3 d 20
New rows need to be added: in this case, the second row for 'a' shown above. How can I do that?
You can craft a dataframe and merge:
(df.drop('relations', axis=1)
.merge(pd.DataFrame(rel_list, columns=['names', 'relations']),
on='names',
how='outer'
)
# .fillna('') # uncomment to replace NaN with empty string
)
Output:
names ages relations
0 a 50 b
1 a 50 c
2 b 40 NaN
3 c 45 d
4 d 20 NaN
Instead of updating df you can create a new one and add relations row by row:
import pandas as pd

data = {
    "names": ['a', 'b', 'c', 'd'],
    "ages": [50, 40, 45, 20],
    "relations": ['', '', '', ''],
}
rel_list = [('a', 'b'), ('a', 'c'), ('c', 'd')]
df = pd.DataFrame(data)

# Start from the original rows, all with an empty relation.
new_df = pd.DataFrame(data)
new_df.loc[:, 'relations'] = ''

# Build one copy of the matching row per (head, tail) pair. assign() returns
# a new frame, so no value is written into a slice of df.
extra_rows = [
    df[df.names == head].assign(relations=tail) for head, tail in rel_list
]
# DataFrame.append was removed in pandas 2.0; a single concat also avoids
# re-copying the growing frame on every iteration.
new_df = pd.concat([new_df, *extra_rows])
print(new_df)
Output:
names ages relations
0 a 50
1 b 40
2 c 45
3 d 20
0 a 50 b
0 a 50 c
2 c 45 d
Then, if needed, in the end you can delete all rows without value in 'relations':
new_df = new_df[new_df['relations']!='']

Pandas, groupby include number of rows grouped in each row

Is there any way to use
df = pd.read_excel(r'a.xlsx')
df2 = df.groupby(by=["col"], as_index=False).mean()
and include a new column with the number of rows that were grouped into each row?
In the absence of sample data, I'm assuming you have multiple numeric columns.
You can use apply() to calculate all the means and then append len() to that series:
# Random demo data: one categorical key column and two numeric columns.
df = pd.DataFrame(
    {
        "col": np.random.choice(list("ABCD"), 200),
        "val": np.random.uniform(1, 5, 200),
        "val2": np.random.uniform(5, 10, 200),
    }
)
# For each group, compute the mean of every numeric column and add the group
# size under "len". Series.append was removed in pandas 2.0, so the two
# Series are combined with pd.concat instead.
df2 = df.groupby(by=["col"], as_index=False).apply(
    lambda d: pd.concat(
        [d.select_dtypes("number").mean(), pd.Series({"len": len(d)})]
    )
)
df2
col
val
val2
len
0
A
3.13064
7.63837
42
1
B
3.1057
7.50656
44
2
C
3.0111
7.82628
54
3
D
3.20709
7.32217
60
comment code
def w_avg(df, values, weights, exp):
    """Return the average of ``df[values]`` weighted by ``df[weights] ** exp``.

    Args:
        df: DataFrame (or group of one) holding both columns.
        values: Name of the column to average.
        weights: Name of the column providing the raw weights.
        exp: Exponent applied to the raw weights before averaging.

    Returns:
        ``sum(value * weight) / sum(weight)`` over the rows that have a value.
    """
    d = df[values]
    w = df[weights] ** exp
    # sum() skips NaN products, so also drop the weights of rows whose value
    # is missing; otherwise they would count in the denominator only.
    w = w.where(d.notna())
    return (d * w).sum() / w.sum()
# Random demo data: two key columns, a rating column and a weight column.
dfg1 = pd.DataFrame(
    {
        "Jogador": np.random.choice(list("ABCD"), 200),
        "Evento": np.random.choice(list("XYZ"), 200),
        "Rating Calculado BW": np.random.uniform(1, 5, 200),
        "Lances": np.random.uniform(5, 10, 200),
    }
)
# Per (Jogador, Evento) group: store the weighted average of the rating
# (weights = "Lances") under every numeric column name, then add the group
# size as "len". Series.append was removed in pandas 2.0, hence pd.concat.
# The lambda argument is named `g` so it no longer shadows the outer dfg1.
dfg = dfg1.groupby(by=["Jogador", "Evento"]).apply(
    lambda g: pd.concat(
        [
            g.select_dtypes("number").agg(
                lambda d: w_avg(g, "Rating Calculado BW", "Lances", 1)
            ),
            pd.Series({"len": len(g)}),
        ]
    )
)
dfg

Pandas add a summary column that counts values that are not empty strings

I have a table that looks like this:
A B C
1 foo
2 foobar blah
3
I want to count up the non empty columns from A, B and C to get a summary column like this:
A B C sum
1 foo 1
2 foobar blah 2
3 0
Here is how I'm trying to do it:
import pandas as pd
# Three string columns where "" marks an empty cell.
df = { 'A' : ["foo", "foobar", ""],
'B' : ["", "blah", ""],
'C' : ["","",""]}
df = pd.DataFrame(df)
print(df)
# notnull() treats "" as a present value, so this counts every column in every row.
df['sum'] = df[['A', 'B', 'C']].notnull().sum(axis=1)
# Compares each cell against "" and counts the non-empty ones; overwrites the line above.
df['sum'] = (df[['A', 'B', 'C']] != "").sum(axis=1)
These last two lines are different ways to get what I want but they aren't working. Any suggestions?
df['sum'] = (df[['A', 'B', 'C']] != "").sum(axis=1)
Worked. Thanks for the assistance.
This one-liner worked for me :)
# Turn "" into NaN and count the non-null cells of each row. count(axis=1)
# keeps df's own index, so the result aligns even when the index is not 0..n-1.
df["sum"] = df.replace("", np.nan).count(axis=1)

How to select pandas row(s) which has attributes column value equals to any one of value from list

data = {
"name": ["abc", "xyz", "pqr"],
"attributes": [["attr2", "attr3"], ["attr2","attr4"], ["attr3", "attr1"] ]
}
df = pd.DataFrame.from_dict(data)
How do I filter the rows that satisfy this condition:
select a row if its attributes column contains any of "attr1" or "attr3"?
expected output is:
name attributes
0 "abc" ["attr2", "attr3"]
1 "pqr" ["attr3", "attr1"]
Using
# Expand each attribute list into its own row of a helper frame, keeping df's
# index so the boolean mask aligns with df. axis is passed by keyword because
# pandas 2.x no longer accepts it positionally for any().
df[pd.DataFrame(df.attributes.tolist(), index=df.index).isin(['attr1', 'attr3']).any(axis=1)]
Out[295]:
attributes name
0 [attr2, attr3] abc
2 [attr3, attr1] pqr
To get a boolean indexer,
>>> idx = df['attributes'].map(lambda l: any(s in l for s in ['attr1', 'attr3']))
>>> idx
0 True
1 False
2 True
Name: attributes, dtype: bool
Then
>>> df.loc[idx]
name attributes
0 abc [attr2, attr3]
2 pqr [attr3, attr1]
Whether you want to reset the index afterward is up to you.