Pandas boolean condition from nested list of dictionaries

[{'id': 123,
  'type': 'salary',  # parent node
  'tx': 'house',
  'sector': 'EU',
  'transition': [{'id': 'hash',  # child node
                  'id': 123,
                  'type': 'salary',
                  'tx': 'house'}]},
 {'userid': 123,
  'type': 'salary',  # parent node
  'tx': 'office',
  'transition': [{'id': 'hash',  # child node
                  'id': 123,
                  'type': 'salary',
                  'tx': 'office'}]}]
In a pandas column ('info') I have some information stored as a nested list of dictionaries, like the example above.
What I'm trying to do is build a boolean condition checking whether this list has the following attributes:
More than one parent node with 'type' == 'salary'
The field 'tx' differs between parent nodes with 'type' == 'salary'
So far I've tried to flatten the list and filter, but that does not address the first and second conditions:
a = df.iloc[0].info
values = [item for sublist in [list(i.values()) for i in a] for item in sublist]

If you want a one-line solution, you can use:
df['check'] = df['info'].apply(lambda x: sum(i['type'] == 'salary' for i in x) > 1 and len({i['tx'] for i in x if i['type'] == 'salary'}) > 1)
or (expanded):
def check(x):
    total_salary = sum(1 for i in x if i['type'] == 'salary')  # count of parents with "type": "salary"
    tx_list = [i['tx'] for i in x if i['type'] == 'salary']    # tx values where type == salary
    tx_check = len(set(tx_list)) > 1                           # True when the tx values are not all the same
    return total_salary > 1 and tx_check

df['check'] = df['info'].apply(check)
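As a quick sanity check with the parent nodes from the sample above (a minimal sketch using only the keys the function reads):
sample = [{'type': 'salary', 'tx': 'house'}, {'type': 'salary', 'tx': 'office'}]
print(check(sample))  # True: more than one 'salary' parent, and their 'tx' values differ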

python complex list object to dataframe

I want to create a dataframe by expanding the child 'fields' object along with the parent objects.
Obviously trying pd.DataFrame(lst) does not work, as it creates a data frame with only three columns and keeps the child object in a single column.
Is it possible to do this in one line instead of iterating through the list to expand each child object? Thank you in advance.
I have a list object in Python like this:
lst = [
    {
        'id': 'rec1',
        'fields': {
            'iso': 'US',
            'name': 'U S',
            'lat': '38.9051',
            'lon': '-77.0162'
        },
        'createdTime': '2021-03-16T13:03:24.000Z'
    },
    {
        'id': 'rec2',
        'fields': {'iso': 'HK', 'name': 'China', 'lat': '0.0', 'lon': '0.0'},
        'createdTime': '2021-03-16T13:03:24.000Z'
    }
]
Expected dataframe:
Use json_normalize:
df = pd.json_normalize(lst)
print (df)
id createdTime fields.iso fields.name fields.lat fields.lon
0 rec1 2021-03-16T13:03:24.000Z US U S 38.9051 -77.0162
1 rec2 2021-03-16T13:03:24.000Z HK China 0.0 0.0
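If the dotted column names are unwanted, the prefix can be stripped afterwards (a small follow-up; json_normalize also takes a sep argument to change the separator):
df.columns = df.columns.str.replace('fields.', '', regex=False)
print(df.columns.tolist())  # ['id', 'createdTime', 'iso', 'name', 'lat', 'lon']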

How to merge pandas DF on imperfect match?

I'm trying to merge/join x and y dataframes based on an exact match of the company columns and a partial match of some degree on the name columns.
Other than looking at the values returned by SequenceMatcher(None, x_name, y_name).ratio(), which were always above .8 in my case, I haven't tried much that warrants mentioning.
x = pd.DataFrame([{'id': 1, 'name': 'Robert Jackson', 'company': 'Test inc.', 'tenure': 6},
                  {'id': 2, 'name': 'William Johnson', 'company': 'Test inc.', 'tenure': 6}]).set_index('id')
y = pd.DataFrame([{'id': 4, 'name': 'Bob Jackson', 'company': 'Test inc.', 'job': 'desk'},
                  {'id': 5, 'name': 'Willy Johnson', 'company': 'Test inc.', 'job': 'desk'}]).set_index('id')
goal = pd.DataFrame([{'x_id': 1, 'y_id': 4, 'x_name': 'Robert Jackson', 'y_name': 'Bob Jackson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'},
                     {'x_id': 2, 'y_id': 5, 'x_name': 'William Johnson', 'y_name': 'Willy Johnson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'}])
Is something like this plausible? I'd appreciate any feedback, thank you.
Great question! I'm following to see other answers as I've been doing a lot of similar work lately. One inefficient method I've taken is to use fuzzywuzzy based on a threshold.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    s = df_2[key2].tolist()
    # for each value in df_1[key1], find the closest matches in df_2[key2]
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    # keep only matches whose score clears the threshold
    m2 = df_1['matches'].apply(lambda x: ', '.join(i[0] for i in x if i[1] >= threshold))
    df_1['matches'] = m2
    return df_1
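A hypothetical call against the x and y frames above (key1 and key2 are both 'name' here; the default threshold of 90 is likely too strict for pairs like 'Robert Jackson' / 'Bob Jackson', so it is lowered):
matched = fuzzy_merge(x.copy(), y.copy(), 'name', 'name', threshold=70)
print(matched[['name', 'matches']])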
The solution that I used was:
from difflib import SequenceMatcher

x['merge_name'] = x['name']
x['merge_comp'] = x['company']
for a, b in x[['name', 'company']].values:
    # iterate with y's own index labels so .loc writes into existing rows
    # (enumerate would write to labels 0/1 and create new rows, since y is indexed by id)
    for ixb, (c, d) in zip(y.index, y[['name', 'company']].values):
        if SequenceMatcher(None, a, c).ratio() >= .8:
            y.loc[ixb, 'merge_name'] = a
        if SequenceMatcher(None, b, d).ratio() == 1:
            y.loc[ixb, 'merge_comp'] = b
goal = pd.merge(x, y, on=['merge_name', 'merge_comp'])
This function worked while passing an arbitrary number of columns (the original's flag =+ 1 set flag to 1 instead of incrementing it, so any column past the second reused the wrong ratio; enumerate fixes that, and a leftover debug print is dropped):
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=False, post_drop=True):
    if reset_index:
        df1.reset_index(inplace=True)
        df2.reset_index(inplace=True)
    merge_columns = []
    # mirror each merge column in df1 under the prefixed name
    for col in columns:
        df1[prefix + col] = df1[col]
        merge_columns.append(prefix + col)
    # for each column, copy df1's value into df2 when the match ratio clears the bar
    for flag, col in enumerate(columns):
        for col_1 in df1[col].values:
            for index, col_2 in enumerate(df2[col].values):
                if SequenceMatcher(None, str(col_1), str(col_2)).ratio() >= ratios[flag]:
                    df2.loc[index, merge_columns[flag]] = col_1
    df = pd.merge(df1, df2, on=merge_columns)
    if post_drop:
        df1.drop(columns=merge_columns, inplace=True)
        df2.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1], reset_index=True)
This function worked for passing exactly 2 columns/ratios:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=True, post_drop=True):
    df1_c = df1.copy()
    df2_c = df2.copy()
    if reset_index:
        df1_c.reset_index(inplace=True)
        df2_c.reset_index(inplace=True)
    df1_c[prefix + columns[0]] = df1_c[columns[0]]
    df1_c[prefix + columns[1]] = df1_c[columns[1]]
    merge_columns = [prefix + columns[0], prefix + columns[1]]
    for col_1, col_2 in df1_c[[columns[0], columns[1]]].values:
        for index, (col_3, col_4) in enumerate(df2_c[[columns[0], columns[1]]].values):
            if SequenceMatcher(None, str(col_1), str(col_3)).ratio() >= ratios[0]:
                df2_c.loc[index, merge_columns[0]] = col_1
            if SequenceMatcher(None, str(col_2), str(col_4)).ratio() >= ratios[1]:
                df2_c.loc[index, merge_columns[1]] = col_2
    df = pd.merge(df1_c, df2_c, on=merge_columns)
    if post_drop:
        df.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1])
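For the key-building step itself, difflib's get_close_matches is a more compact alternative to the nested SequenceMatcher loops (a sketch; cutoff plays the role of the ratio threshold, and get_close_matches uses SequenceMatcher internally):
from difflib import get_close_matches

x2 = x.copy()
x2['merge_name'] = x2['name']
x2['merge_comp'] = x2['company']
y2 = y.copy()
# adopt the closest x name above the cutoff as y's merge key (None when nothing matches)
y2['merge_name'] = [(get_close_matches(n, x2['merge_name'].tolist(), n=1, cutoff=0.8) or [None])[0]
                    for n in y2['name']]
y2['merge_comp'] = y2['company']
goal = pd.merge(x2, y2, on=['merge_name', 'merge_comp'])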

Conditional mapping among columns of two data frames with Pandas Data frame

I need your advice on how to map columns between data frames.
I have put it in a simple form so that it's easier to understand (df = dataframe).
EXAMPLE:
df1 = pd.DataFrame({
    "X": [],
    "Y": [],
    "Z": []
})
df2 = pd.DataFrame({
    "A": ['', '', 'A1'],
    "C": ['', '', 'C1'],
    "D": ['D1', 'Other', 'D3'],
    "F": ['', '', ''],
    "G": ['G1', '', 'G3'],
    "H": ['H1', 'H2', 'H3']
})
Requirement:
1st step:
Column X of df1 should take a value from df2's columns A, C, D, in that order; the search stops at the first non-empty value it finds.
2nd step:
If the selected value is "Other", column X should instead search columns F, G, and H, in that order, until it finds a non-empty value.
Result:
X
0 D1
1 H2
2 A1
Thank you so much in advance
Try this:
def first_non_empty(df, cols):
    """Return the first non-empty, non-null value among the specified columns per row."""
    return df[cols].replace('', pd.NA).bfill(axis=1).iloc[:, 0]

col_x = first_non_empty(df2, ['A', 'C', 'D'])
col_x = col_x.mask(col_x == 'Other', first_non_empty(df2, ['F', 'G', 'H']))
df1['X'] = col_x
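Running this against the example frames should reproduce the requested result (a quick check; the empty df1 takes its index from the assigned Series):
print(df1['X'].tolist())  # expected: ['D1', 'H2', 'A1']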

Collapsing a pandas dataframe into a single column of all items and their occurrences

I have a data frame consisting of a mixture of 'NaN' strings and other strings, e.g.
data = {'String1': ['NaN', 'tree', 'car', 'tree'],
        'String2': ['cat', 'dog', 'car', 'tree'],
        'String3': ['fish', 'tree', 'NaN', 'tree']}
ddf = pd.DataFrame(data)
I want to:
1: count the total number of occurrences of each item and put them in a new data frame, e.g.
NaN=2
tree=5
car=2
fish=1
cat=1
dog=1
2: count the total number of items when compared against a separate, longer list (a column of another data frame), e.g.
df['compare'] =
NaN
tree
car
fish
cat
dog
rabbit
Pear
Orange
snow
rain
Thanks
Jason
For the first question:
from collections import Counter
import pandas as pd

data = {
    "String1": ["NaN", "tree", "car", "tree"],
    "String2": ["cat", "dog", "car", "tree"],
    "String3": ["fish", "tree", "NaN", "tree"],
}
ddf = pd.DataFrame(data)

# count every value across the whole frame
a = Counter(ddf.stack().tolist())
df_result = pd.DataFrame(dict(a), index=['Count']).T

# for the second question: map the counts onto the longer compare list
df = pd.DataFrame({'vals': ['NaN', 'tree', 'car', 'fish', 'cat', 'dog', 'rabbit', 'Pear', 'Orange', 'snow', 'rain']})
df_counts = df.vals.map(df_result.to_dict()['Count'])
This should do :)
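A quick look at the result (entries from the compare list that never appear in ddf map to NaN; tree maps to 5, rabbit to NaN, and so on):
print(df.assign(count=df_counts))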
You can use the following code to count items over the whole data frame.
import pandas as pd

data = {'String1': ['NaN', 'tree', 'car', 'tree'],
        'String2': ['cat', 'dog', 'car', 'tree'],
        'String3': ['fish', 'tree', 'NaN', 'tree']}
df = pd.DataFrame(data)

def get_counts(df: pd.DataFrame) -> dict:
    res = {}
    for col in df.columns:
        vc = df[col].value_counts().to_dict()  # per-column counts
        for k, v in vc.items():
            res[k] = res.get(k, 0) + v         # accumulate across columns
    return res

counts = get_counts(df)
Output
>>> print(counts)
{'tree': 5, 'car': 2, 'NaN': 2, 'cat': 1, 'dog': 1, 'fish': 1}
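For reference, the same counts are available in one line with stack plus value_counts, since the 'NaN' entries here are plain strings and survive the stack:
print(df.stack().value_counts().to_dict())
# {'tree': 5, 'car': 2, 'NaN': 2, 'cat': 1, 'dog': 1, 'fish': 1}  (tie order may differ)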

Pandas - Filtering out column based on value

I have a Pandas Dataframe with two columns as below (shown with header):
name,attribute
abc,{'attributes': {'type': 'RecordType', 'url': '/services/data/v38.0/sobjects/RecordType/000xyz'}, 'Name': 'Product 1'}
def,{'attributes': {'type': 'RecordType', 'url': '/services/data/v38.0/sobjects/RecordType/000abc'}, 'Name': 'Product 2'}
klm,{'attributes': {'type': 'RecordType', 'url': '/services/data/v38.0/sobjects/RecordType/000abc'}, 'Name': 'Product 2'}
How could I filter the rows whose attribute dict has 'Name' == 'Product 1'?
Could anyone assist? Thanks.
Use a list comprehension with get, which also works when the 'Name' key does not exist in some rows, to build a boolean mask and filter by boolean indexing:
df = df[[x.get('Name') == 'Product 1' for x in df['attribute']]]
Or:
df = df[df['attribute'].apply(lambda x: x.get('Name')) == 'Product 1']
# alternative, works only if the 'Name' key exists in every row
# df = df[df['attribute'].apply(lambda x: x['Name']) == 'Product 1']
print (df)
name attribute
0 abc {'attributes': {'type': 'RecordType', 'url': '...
EDIT:
If you also want to filter by the nested dictionaries (the default {} guards rows where 'attributes' is missing):
df = df[[x.get('attributes', {}).get('type') == 'RecordType' for x in df['attribute']]]
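If the dictionaries are uniform, flattening once with json_normalize can make repeated filters simpler (a sketch, assuming the column holds parsed dicts rather than JSON strings):
flat = pd.json_normalize(df['attribute'].tolist())
mask = (flat['Name'] == 'Product 1') & (flat['attributes.type'] == 'RecordType')
df = df[mask.to_numpy()]  # to_numpy() avoids index-alignment surprises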