How to select pandas row(s) which has attributes column value equals to any one of value from list - pandas

data = {
"name": ["abc", "xyz", "pqr"],
"attributes": [["attr2", "attr3"], ["attr2","attr4"], ["attr3", "attr1"] ]
}
df = pd.DataFrame.from_dict(data)
How do i filter rows which satisfies this condition:
select row if it's attributes column contains values any of "attr1" or "attr3"
expected output is:
name attributes
0 "abc" ["attr2", "attr3"]
1 "pqr" ["attr3", "attr1"]

Using
df[pd.DataFrame(df.attributes.tolist()).isin(['attr1','attr3']).any(1)]
Out[295]:
attributes name
0 [attr2, attr3] abc
2 [attr3, attr1] pqr

To get a boolean indexer,
>>> idx = df['attributes'].map(lambda l: any(s in l for s in ['attr1', 'attr3']))
>>> idx
0 True
1 False
2 True
Name: attributes, dtype: bool
Then
>>> df.loc[idx]
name attributes
0 abc [attr2, attr3]
2 pqr [attr3, attr1]
Whether you want to reset the index afterward is up to you.

Related

Allocate values between two pandas dataframes

Consider two dataframes:
>> import pandas as pd
>> df1 = pd.DataFrame({"category": ["foo", "foo", "bar", "bar", "bar"], "quantity": [1,2,1,2,3]})
>> print(df1)
category quantity
0 foo 1
1 foo 2
2 bar 1
3 bar 2
4 bar 3
>> df2 = pd.DataFrame({
"category": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "bar", "bar"],
"item": ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
})
>> print(df2)
category item
0 foo A
1 foo B
2 foo C
3 foo D
4 bar E
5 bar F
6 bar G
7 bar H
8 bar I
9 bar J
How can I create a new column in df1 (new dataframe called df3) which joins on category column of df1 and allocates the item column in df2. So, create something like:
>> df3 = pd.DataFrame({
"category": ["foo", "foo", "bar", "bar", "bar"],
"quantity": [1,2,1,2,3],
"item": ["A", "B,C", "E", "F,G", "H,I,J"]
})
category quantity item
0 foo 1 A
1 foo 2 B,C
2 bar 1 E
3 bar 2 F,G
4 bar 3 H,I,J
You can create helper DataFrame by repeat rows by quantity column by Index.repeat with DataFrame.loc, convert index to column for avoid lost indices and create helper column g in both DataFrames for merging by duplicated categories by GroupBy.cumcount, then use DataFrame.merge with aggregate join:
df11 = (df1.loc[df1.index.repeat(df1['quantity'])].reset_index()
.assign(g = lambda x: x.groupby('category').cumcount()))
df22 = df2.assign(g = df2.groupby('category').cumcount())
df = (df11.merge(df22, on=['g','category'], how='left')
.groupby(['index','category','quantity'])['item']
.agg(lambda x: ','.join(x.dropna()))
.droplevel(0)
.reset_index())
print (df)
category quantity item
0 foo 1 A
1 foo 2 B,C
2 bar 1 E
3 bar 2 F,G
4 bar 3 H,I,J
You can use an iterator with itertools.islice:
from itertools import islice
# aggregate the items as iterator
s = df2.groupby('category')['item'].agg(iter)
# for each category, allocate as many items as needed and join
df1['item'] = (df1.groupby('category', group_keys=False)['quantity']
.apply(lambda g:
g.map(lambda x: ','.join(list(islice(s[g.name], x)))))
)
Output:
category quantity item
0 foo 1 A
1 foo 2 B,C
2 bar 1 E
3 bar 2 F,G
4 bar 3 H,I,J
Note that if you don't have enough items, this will just use what is available.
Example using df2 truncated after F as input:
category quantity item
0 foo 1 A
1 foo 2 B,C
2 bar 1 E
3 bar 2 F
4 bar 3
def function1(dd:pd.DataFrame):
col2=dd.quantity.cumsum()
col1=col2.shift(fill_value=0)
return dd.assign(col1=col1,col2=col2).apply(lambda ss:",".join(
df2.loc[df2.category==ss.category,"item"].iloc[ss.col1:ss.col2].tolist()
),axis=1)
df1.assign(item=df1.groupby('category').apply(function1).droplevel(0))
out
category quantity item
0 foo 1 A
1 foo 2 B,C
2 bar 1 E
3 bar 2 F,G
4 bar 3 H,I,J

How to update a pandas column

Given the following dataframe
col1 col2
1 ('A->B', 'B->C')
2 ('A->D', 'D->C', 'C->F')
3 ('A->K', 'K->M', 'M->P')
...
I want to convert this to the following format
col1 col2
1 'A-B-C'
2 'A-D-C-F'
3 'A-K-M-P'
...
Each sequence shows an arc within a path. Hence, the sequence is like (a,b), (b,c), (c,d) ...
def merge_values(val):
val = [x.split('->') for x in val]
out = []
for char in val:
out.append(char[0])
out.append(val[-1][1])
return '-'.join(out)
df['col2'] = df['col2'].apply(merge_values)
print(df)
Output:
col1 col2
0 1 A-B-C
1 2 A-D-C-F
2 3 A-K-M-P
Given
df = pd.DataFrame({
'col1': [1, 2, 3],
'col2': [
('A->B', 'B->C'),
('A->D', 'D->C', 'C->F'),
('A->K', 'K->M', 'M->P'),
],
})
You can do:
def combine(t, old_sep='->', new_sep='-'):
if not t: return ''
if type(t) == str: t = [t]
tokens = [x.partition(old_sep)[0] for x in t]
tokens += t[-1].partition(old_sep)[-1]
return new_sep.join(tokens)
df['col2'] = df['col2'].apply(combine)

Build a decision Column by ANDing multiple columns in pandas

I have a pandas data frame which is shown below:
>>> x = [[1,2,3,4,5],[1,2,4,4,3],[2,4,5,6,7]]
>>> columns = ['a','b','c','d','e']
>>> df = pd.DataFrame(data = x, columns = columns)
>>> df
a b c d e
0 1 2 3 4 5
1 1 2 4 4 3
2 2 4 5 6 7
I have an array of objects (conditions) as shown below:
[
{
'header' : 'a',
'condition' : '==',
'values' : [1]
},
{
'header' : 'b',
'condition' : '==',
'values' : [2]
},
...
]
and an assignHeader which is:
assignHeader = decision
now I want to do an operation which builds up all the conditions from the conditions array by looping through it, for example something like this:
pConditions = []
for eachCondition in conditions:
header = eachCondition['header']
values = eachCondition['values']
if eachCondition['condition'] == "==":
pConditions.append(df[header].isin(values))
else:
pConditions.append(~df[header].isin(values))
df[assignHeader ] = and(pConditions)
I was thinking of using all operator in pandas but am unable to crack the right syntax to do so. The list I shared can go big and dynamic and so I want to use this nested approach and check for the equality. Does anyone know a way to do so?
Final Output:
conditons = [df['a']==1,df['b']==2]
>>> df['decision'] = (df['a']==1) & (df['b']==2)
>>> df
a b c d e decision
0 1 2 3 4 5 True
1 1 2 4 4 3 True
2 2 4 5 6 7 False
Here conditions array will be variable. And I want to have a function which takes df, 'newheadernameandconditions` as input and returns the output as shown below:
>>> df
a b c d e decision
0 1 2 3 4 5 True
1 1 2 4 4 3 True
2 2 4 5 6 7 False
where newheadername = 'decision'
I was able to solve the problem using the code shown below. I am not sure if this is kind of fast way of getting things done, but would love to know your inputs in case you have any specific thing to point out.
def andMerging(conditions, mergeHeader, df):
if len(conditions) != 0:
df[mergeHeader] = pd.concat(conditions, axis = 1).all(axis = 1)
return df
where conditions are an array of pd.Series with boolean values.
And conditions are formatted as shown below:
def prepareForConditionMerging(conditionsArray, df):
conditions = []
for prop in conditionsArray:
condition = prop['condition']
values = prop['values']
header = prop['header']
if type(values) == str:
values = [values]
if condition=="==":
conditions.append(df[header].isin(values))
else:
conditions.append(~df[header].isin(values))
# Here we can add more conditions such as greater than less than etc.
return conditions

Pandas | Filter DF rows with Integers that lie between two Integer values in another Dataframe

I got two Dataframes. The goal is to filter out rows in DF1 that have an Integer value that lies between any of the Integers in the ["Begin"] and ["End"] columns in any of the 37 rows in DF2.
DF1:
INDEX String IntValues
1 "string" 808091
2 "string" 1168262
3 "string" 1169294
... ... ...
647 "string" 14193661
648 "string" 14551918
DF2:
Index Begin End
1 1196482.2 1216529
2 1791819.7 1834887
3 2008405.1 2014344
... ... ...
36 14168540.0 14193933
37 14727507.1 14779605
I think it is possible to use something like :
df1[(df1["IntValues"] >=1196482.2 ) & (df1["IntValues"] <= 1216529),(... 36 more conditions)].
Is there a better way than just writing down these 37 conditions, like a variable for the begin and end values of that "filter window" ?
Edit: As requested a code sample, not from the original DF, but i hope it suffices.
d1 = {
"string":["String0", "String1", "String2", "String3", "String4", "String5", "String6", "String7", "String8", "String9", "String10", "String11", "String12", "String13", "String14"],
"timestamp":[1168262,1169294, 1184451, 1210449,1210543,1210607, 1644328,
1665732, 1694388,1817309,1822872,1825310,2093796,2182923,2209252 ],
"should be in": ["Yes", "Yes", "Yes", "No", "No","No", "yes","yes","yes","no","no", "no","yes","yes","no"]
}
df1 = pd.DataFrame(d1)
d2={
'begin' : [1196482.2,1791819.7,2199564.6],
'end' : [1216529,1834887,2212352]
}
df2 = pd.DataFrame(d2)
Try this:
df_final=[]
for i,j in zip(df2["Begin"],df2["End"]):
x=df1[(df1["IntValues"] >=i ) & (df1["IntValues"] <= j)]
df_final.append(x)
df_final=pd.concat(df_final,axis=0).reset_index(drop=True)
df_final=df_final.drop_duplicates()

Pandas add a summary column that counts values that are not empty strings

I have a table that looks like this:
A B C
1 foo
2 foobar blah
3
I want to count up the non empty columns from A, B and C to get a summary column like this:
A B C sum
1 foo 1
2 foobar blah 2
3 0
Here is how I'm trying to do it:
import pandas as pd
df = { 'A' : ["foo", "foobar", ""],
'B' : ["", "blah", ""],
'C' : ["","",""]}
df = pd.DataFrame(df)
print(df)
df['sum'] = df[['A', 'B', 'C']].notnull().sum(axis=1)
df['sum'] = (df[['A', 'B', 'C']] != "").sum(axis=1)
These last two lines are different ways to get what I want but they aren't working. Any suggestions?
df['sum'] = (df[['A', 'B', 'C']] != "").sum(axis=1)
Worked. Thanks for the assistance.
This one-liner worked for me :)
df["sum"] = df.replace("", np.nan).T.count().reset_index().iloc[:,1]