How to merge pandas DataFrames on an imperfect match?

I'm trying to merge/join the x and y dataframes below based on an exact match of the company columns and a partial match of some degree on the name columns.
Other than looking at the values returned by SequenceMatcher(None, x_name, y_name).ratio(), which were always above 0.8 in my case, I haven't tried much that warrants mentioning.
import pandas as pd

x = pd.DataFrame([{'id': 1, 'name': 'Robert Jackson', 'company': 'Test inc.', 'tenure': 6},
                  {'id': 2, 'name': 'William Johnson', 'company': 'Test inc.', 'tenure': 6}]).set_index('id')
y = pd.DataFrame([{'id': 4, 'name': 'Bob Jackson', 'company': 'Test inc.', 'job': 'desk'},
                  {'id': 5, 'name': 'Willy Johnson', 'company': 'Test inc.', 'job': 'desk'}]).set_index('id')
goal = pd.DataFrame([{'x_id': 1, 'y_id': 4, 'x_name': 'Robert Jackson', 'y_name': 'Bob Jackson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'},
                     {'x_id': 2, 'y_id': 5, 'x_name': 'William Johnson', 'y_name': 'Willy Johnson', 'company': 'Test inc.', 'tenure': 6, 'job': 'desk'}])
Is something like this plausible? I'd appreciate any feedback, thank you.

Great question! I'm following to see other answers, as I've been doing a lot of similar work lately. One admittedly inefficient approach I've taken is to use fuzzywuzzy with a match-score threshold.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    # candidate strings to match against
    s = df_2[key2].tolist()
    # best fuzzy matches (string, score) for each value in df_1[key1]
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    # keep only the matched strings whose score clears the threshold
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    return df_1
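For example, it could be called like this (a sketch of mine, not from the original answer; the column names come from the question and the threshold of 80 is just a guess that may need tuning for these names):
matched = fuzzy_merge(x.reset_index(), y.reset_index(), 'name', 'name', threshold=80)
# rows whose best match scored below the threshold get an empty 'matches'
# string and simply drop out of the inner join below
result = matched.merge(y.reset_index(), left_on='matches', right_on='name',
                       suffixes=('_x', '_y'))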

The solution that I used was:
from difflib import SequenceMatcher

x['merge_name'] = x['name']
x['merge_comp'] = x['company']
for a, b in x[['name', 'company']].values:
    for ixb, (c, d) in enumerate(y[['name', 'company']].values):
        # use y's actual index label, since y is indexed by id rather than 0..n-1
        if SequenceMatcher(None, a, c).ratio() >= .8:
            y.loc[y.index[ixb], 'merge_name'] = a
        if SequenceMatcher(None, b, d).ratio() == 1:
            y.loc[y.index[ixb], 'merge_comp'] = b
goal = pd.merge(x, y, on=['merge_name', 'merge_comp'])
This function worked while passing an arbitrary number of columns:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=False, post_drop=True):
    if reset_index:
        df1.reset_index(inplace=True)
        df2.reset_index(inplace=True)
    # create one helper merge column on df1 per column to match on
    merge_columns = []
    for flag in range(len(columns)):
        df1[prefix + columns[flag]] = df1[columns[flag]]
        merge_columns.append(prefix + columns[flag])
    # copy df1's value into df2's helper column wherever the similarity
    # ratio clears the corresponding threshold
    for flag in range(len(columns)):
        for col_1 in df1[columns[flag]].values:
            for index, col_2 in enumerate(df2[columns[flag]].values):
                if SequenceMatcher(None, str(col_1), str(col_2)).ratio() >= ratios[flag]:
                    df2.loc[index, merge_columns[flag]] = col_1
    df = pd.merge(df1, df2, on=merge_columns)
    if post_drop:
        df1.drop(columns=merge_columns, inplace=True)
        df2.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1], reset_index=True)
This function worked for passing exactly 2 columns/ratios:
def sm_merge(df1, df2, columns=[], ratios=[], prefix='m_', reset_index=True, post_drop=True):
    df1_c = df1.copy()
    df2_c = df2.copy()
    if reset_index:
        df1_c.reset_index(inplace=True)
        df2_c.reset_index(inplace=True)
    # helper merge columns on the df1 copy
    df1_c[prefix + columns[0]] = df1_c[columns[0]]
    df1_c[prefix + columns[1]] = df1_c[columns[1]]
    merge_columns = [prefix + columns[0], prefix + columns[1]]
    # copy df1's values into df2's helper columns wherever the similarity
    # ratio clears the corresponding threshold
    for col_1, col_2 in df1_c[[columns[0], columns[1]]].values:
        for index, (col_3, col_4) in enumerate(df2_c[[columns[0], columns[1]]].values):
            if SequenceMatcher(None, str(col_1), str(col_3)).ratio() >= ratios[0]:
                df2_c.loc[index, merge_columns[0]] = col_1
            if SequenceMatcher(None, str(col_2), str(col_4)).ratio() >= ratios[1]:
                df2_c.loc[index, merge_columns[1]] = col_2
    df = pd.merge(df1_c, df2_c, on=merge_columns)
    if post_drop:
        df.drop(columns=merge_columns, inplace=True)
    return df

sm_merge(x, y, columns=['name', 'company'], ratios=[.8, 1])

Related

Collapsing a pandas dataframe into a single column of all items and their occurrences

I have a data frame consisting of a mixture of NaN's and strings, e.g.
import pandas as pd

data = {'String1': ['NaN', 'tree', 'car', 'tree'],
        'String2': ['cat', 'dog', 'car', 'tree'],
        'String3': ['fish', 'tree', 'NaN', 'tree']}
ddf = pd.DataFrame(data)
I want to:
1: Count the total number of items and put them in a new data frame, e.g.
NaN=2
tree=5
car=2
fish=1
cat=1
dog=1
2: Count the total number of items when compared to a separate, longer list (a column of another data frame), e.g.
df['compare'] =
NaN
tree
car
fish
cat
dog
rabbit
Pear
Orange
snow
rain
Thanks
Jason
For the first question:
from collections import Counter
import pandas as pd

data = {
    "String1": ["NaN", "tree", "car", "tree"],
    "String2": ["cat", "dog", "car", "tree"],
    "String3": ["fish", "tree", "NaN", "tree"],
}
ddf = pd.DataFrame(data)
# count every item across all columns at once
a = Counter(ddf.stack().tolist())
df_result = pd.DataFrame(dict(a), index=['Count']).T
# second part: look up the counts for the longer comparison list
df = pd.DataFrame({'vals': ['NaN', 'tree', 'car', 'fish', 'cat', 'dog', 'rabbit', 'Pear', 'Orange', 'snow', 'rain']})
df_counts = df.vals.map(df_result.to_dict()['Count'])
This should do it :)
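A more compact alternative (a sketch of mine, not part of the answer above) is to let pandas do the tallying directly:
# stack() flattens all columns into one Series, value_counts() tallies it
counts = ddf.stack().value_counts()
# the counts can then be looked up for the longer comparison column the same way
df['counts'] = df['vals'].map(counts).fillna(0).astype(int)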
You can use the following code to count the items over the whole data frame.
import pandas as pd

data = {'String1': ['NaN', 'tree', 'car', 'tree'],
        'String2': ['cat', 'dog', 'car', 'tree'],
        'String3': ['fish', 'tree', 'NaN', 'tree']}
df = pd.DataFrame(data)

def get_counts(df: pd.DataFrame) -> dict:
    # accumulate per-column value_counts into a single dict
    res = {}
    for col in df.columns:
        vc = df[col].value_counts().to_dict()
        for k, v in vc.items():
            if k in res:
                res[k] += v
            else:
                res[k] = v
    return res

counts = get_counts(df)
Output
>>> print(counts)
{'tree': 5, 'car': 2, 'NaN': 2, 'cat': 1, 'dog': 1, 'fish': 1}
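The second part of the question can be handled the same way (a hedged sketch of mine, reusing the counts dict from above):
# look the counts up for the longer comparison list; items not seen get 0
compare = pd.Series(['NaN', 'tree', 'car', 'fish', 'cat', 'dog', 'rabbit', 'Pear', 'Orange', 'snow', 'rain'])
compare_counts = compare.map(counts).fillna(0).astype(int)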

How to split every string in a list in a dataframe column

I have a dataframe with a column containing lists of strings of the form 'A:B'. I'd like to add a new column containing, for each row, the set of first elements obtained by splitting each string on ':'.
import pandas as pd

data = [
    {'Name': 'A', 'Servers': ['A:s1', 'B:s2', 'C:s3', 'C:s2']},
    {'Name': 'B', 'Servers': ['B:s1', 'C:s2', 'B:s3', 'A:s2']},
    {'Name': 'C', 'Servers': ['G:s1', 'X:s2', 'Y:s3']}
]
df = pd.DataFrame(data)
df
df['Clusters'] = [
{'A', 'B', 'C'},
{'B', 'C', 'A'},
{'G', 'X', 'Y'}
]
Learn how to use apply
In [5]: df['Clusters'] = df['Servers'].apply(lambda x: {p.split(':')[0] for p in x})
In [6]: df
Out[6]:
Name Servers Clusters
0 A [A:s1, B:s2, C:s3, C:s2] {A, B, C}
1 B [B:s1, C:s2, B:s3, A:s2] {C, B, A}
2 C [G:s1, X:s2, Y:s3] {X, Y, G}
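An alternative sketch (mine, not from the answer above), assuming pandas 0.25+ for explode:
# one row per server string, take the prefix before ':', then collect the
# prefixes back into a set per original row
df['Clusters'] = (df['Servers']
                  .explode()
                  .str.split(':').str[0]
                  .groupby(level=0)
                  .agg(set))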

Pandas Styling (background + font) based on String data - Is there a better way?

Can I combine the lambda function for background color and the lambda function for font color into a single lambda function? This will be used on a very large dataframe with plenty of different styling, so it would be nice to cut the code in half.
Any other suggestions for a better way are welcome.
import pandas as pd

# raw data
df = pd.DataFrame({'Name': ['name1', 'name2', 'name3', 'name1', 'name2', 'name3', 'name1', 'name2', 'name3'],
                   'Rotation': ['ER', 'PEDI', 'MAM', 'PEDI', 'ERJD', 'PEDI', 'JMAM', 'ERSN', 'ABD']})
#style
df = df.style.apply(lambda x: ["background-color: green" if 'ER' in v else "" for v in x], axis = 1)\
.apply(lambda x: ["color: orange" if 'ER' in v else "" for v in x], axis = 1)\
.apply(lambda x: ["background-color: red" if 'MAM' in v else "" for v in x], axis = 1)\
.apply(lambda x: ["color: yellow" if 'MAM' in v else "" for v in x], axis = 1)
The resulting styled df is shown below (screenshot omitted).
I'd do something like this (Python 3.6+ for f-strings):
def where(x):
    bg = ['green', 'red']
    fg = ['orange', 'yellow']
    ls = ['ER', 'MAM']
    # the first matching substring wins: set both background and font color
    for i, y in enumerate(ls):
        if y in x:
            return f"background-color: {bg[i]}; color: {fg[i]}"
    return ''

df.style.applymap(where)
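If only some columns should be styled, Styler.applymap also accepts a subset argument (in pandas 2.1+ the method is named Styler.map). A small hedged variant, assuming df is still the raw DataFrame rather than the Styler built in the question:
# apply the element-wise styling to the 'Rotation' column only
df.style.applymap(where, subset=['Rotation'])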

Merging dataframes by file name

I have multiple files with the following naming convention.
ENCSR000EQO_0_0.txt
ENCSR000DIA_0_0.txt
ENCSR000DIA_1_1.txt
ENCSR000DIA_2_1.txt
ENCSR000DIM_0_0.txt
ENCSR000DIM_1_1.txt
ENCSR000AIB_0_0.txt
ENCSR000AIB_1_1.txt
ENCSR000AIB_2_1.txt
ENCSR000AIB_3_1.txt
I want to merge them as dataframes using pandas according to the file name, so I would have 4 resulting dataframes. Then, for each of these 4, I want to group by the gene (GeneName) column, since the same gene will appear multiple times.
They all have the same columns in the same order. I can merge all 10 together at once, but I couldn't figure out how to merge by name.
import os
import numpy as np
import pandas as pd

path = '/renamed/'
print os.listdir(path)
df_merge = None
for fname in os.listdir(path):
    if fname.endswith('.txt'):
        df = pd.read_csv(path + fname, sep='\t', header=0)
        df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                      'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                      'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                      'GeneDescription', 'GeneType']
        df = df.groupby('GeneName').agg(np.mean)
        print df
Thank you for any input.
I would do something more like this, where you can use glob to get the filenames, check each one, and then group the concatenated results.
import glob
import os
import numpy as np
import pandas as pd

path = 'renamed'
df_merge = None
for fid in ('EQO', 'DIA', 'DIM', 'AIB'):
    df_ = pd.DataFrame()
    for fname in glob.glob(os.path.join(path, '*.txt')):
        if fid in fname:
            df = pd.read_csv(fname, sep='\t', header=0)
            df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                          'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                          'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                          'GeneDescription', 'GeneType']
            df_ = pd.concat((df_, df))
    df_ = df_.groupby('GeneName').agg(np.mean)
    print df_
Edit: expanding answer to be more automated.
Based on your filenames you might be able to id them as follows:
import numpy as np
files = glob.glob(os.path.join(path, '*.txt'))
fids = np.unique([file.split('_')[0] for file in files])
Putting it all together, the updated code would be:
import glob
import os
import numpy as np
import pandas as pd

path = 'renamed'
files = glob.glob(os.path.join(path, '*.txt'))
# unique file ids taken from the part of the name before the first underscore
fids = np.unique([file.split('_')[0] for file in files])
df_merge = None
for fid in fids:
    df_ = pd.DataFrame()
    for fname in files:
        if fid in fname:
            df = pd.read_csv(fname, sep='\t', header=0)
            df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                          'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                          'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                          'GeneDescription', 'GeneType']
            df_ = pd.concat((df_, df))
    df_ = df_.groupby('GeneName').agg(np.mean)
    print df_
Try adding the file name as a column, append all the df's to a list, concat them, then group:
df_merge = []
for fname in os.listdir(path):
    if fname.endswith('.txt'):
        df = pd.read_csv(path + fname, sep='\t', header=0)
        df.columns = ['ID ', 'Chr', 'Start', 'End', 'Strand', 'Peak Score', 'Focus Ratio/Region Size',
                      'Ann', 'DetAnn', 'Distance', 'PromoterID', 'EID',
                      'Unigene', 'Refseq', 'Ensembl', 'GeneName', 'GeneAlias',
                      'GeneDescription', 'GeneType']
        # repeat the file id so its length matches the frame
        df['fname'] = [fname.split('_')[0] for x in df.index]
        df_merge.append(df)
df_all = pd.concat(df_merge)
for fn in set(df_all['fname'].values):
    print df_all[df_all['fname'] == fn].groupby('GeneName').agg(np.mean)
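A small variation on the same idea (my suggestion, not part of the answer above): since the file id is now a column, a single groupby over both keys avoids the outer loop entirely:
# one grouped result keyed by (fname, GeneName)
result = df_all.groupby(['fname', 'GeneName']).agg(np.mean)
print(result)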

Aggregate/Remove duplicate rows in DataFrame based on swapped index levels

Sample input
import pandas as pd
df = pd.DataFrame([
    ['A', 'B', 1, 5],
    ['B', 'C', 2, 2],
    ['B', 'A', 1, 1],
    ['C', 'B', 1, 3]],
    columns=['from', 'to', 'type', 'value'])
df = df.set_index(['from', 'to', 'type'])
Which looks like this:
              value
from to type
A    B  1         5
B    C  2         2
     A  1         1
C    B  1         3
Goal
I now want to remove "duplicate" rows from this in the following sense: for each row with an arbitrary index (from, to, type), if there exists a row (to, from, type), the value of the second row should be added to the first row and the second row be dropped. In the example above, the row (B, A, 1) with value 1 should be added to the first row and dropped, leading to the following desired result.
Sample result
              value
from to type
A    B  1         6
B    C  2         2
C    B  1         3
This is my best try so far. It feels unnecessarily verbose and clunky:
# aggregate value of rows with (from, to, type) == (to, from, type)
df2 = df.reset_index()
df3 = df2.rename(columns={'from': 'to', 'to': 'from'})
df_both = df.join(df3.set_index(
    ['from', 'to', 'type']),
    rsuffix='_b').sum(axis=1)
# then remove the second, i.e. the (to, from, type) row
rows_to_keep = []
rows_to_remove = []
for a, b, t in df_both.index:
    if (b, a, t) in df_both.index and not (b, a, t) in rows_to_keep:
        rows_to_keep.append((a, b, t))
        rows_to_remove.append((b, a, t))
df_final = df_both.drop(rows_to_remove)
df_final
Especially the second "de-duplication" step feels very unpythonic. (How) can I improve these steps?
Not sure how much better this is, but it's certainly different:
import pandas as pd
from collections import Counter

df = pd.DataFrame([
    ['A', 'B', 1, 5],
    ['B', 'C', 2, 2],
    ['B', 'A', 1, 1],
    ['C', 'B', 1, 3]],
    columns=['from', 'to', 'type', 'value'])
df = df.set_index(['from', 'to', 'type'])
ls = list(df.to_records())
ls2 = []
for l in ls:
    # repeat each (from, to, type) triple 'value' times
    i = 0
    while i < l[3]:
        ls2.append(list(l)[:3])
        i += 1
# sort only the (from, to) pair so swapped rows collapse to the same key
counted = Counter((*sorted(entry[:2]), entry[2]) for entry in ls2)