Mapping selective data with another dataframe in pandas

I want to map data between df2 and df1 using selective columns.
import pandas as pd
df_data = [
    {'id': '1234', 'task': 'data_trasnfer', 'filename': 'orance_bank', 'date': '17-3-22'},
    {'id': '234', 'task': 'data2trasnfer', 'filename': 'ftr_data', 'date': '16-03-2022'},
    {'id': '4567', 'task': 'data3_transfer', 'filename': 'trnienn_data', 'date': '15-2-22'}
]
df1 = pd.DataFrame(df_data)
df1
     id            task      filename        date
0  1234   data_trasnfer   orance_bank     17-3-22
1   234   data2trasnfer      ftr_data  16-03-2022
2  4567  data3_transfer  trnienn_data     15-2-22
df_data1 = [{'target':'ed34','status':'sucess','flow_in':'ntfc_to_pad'},{'target':'der456','status':'error','flow_in':'htr_tokid'}]
df2 = pd.DataFrame(df_data1)
df2
   target  status      flow_in
0    ed34  sucess  ntfc_to_pad
1  der456   error    htr_tokid
Expected output:
ed34 from df2 should map only to filename orance_bank, and der456 only to trnienn_data:
     id            task      filename        date  target  status      flow_in
0  1234   data_trasnfer   orance_bank     17-3-22    ed34  sucess  ntfc_to_pad
1   234   data2trasnfer      ftr_data  16-03-2022
2  4567  data3_transfer  trnienn_data     15-2-22  der456   error    htr_tokid

First build a mapping dictionary and use it to add the filename to df2:
filemap = {
    "ed34": "orance_bank",
    "der456": "trnienn_data"
}
df2['filename'] = df2['target'].map(filemap)
Then merge the 2 dataframes:
df1.merge(df2, on='filename', how='outer').fillna('')
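With the sample frames above, this should give the expected output (the unmatched ftr_data row is blanked out by fillna('')):
     id            task      filename        date  target  status      flow_in
0  1234   data_trasnfer   orance_bank     17-3-22    ed34  sucess  ntfc_to_pad
1   234   data2trasnfer      ftr_data  16-03-2022
2  4567  data3_transfer  trnienn_data     15-2-22  der456   error    htr_tokid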

Related

If column is substring of another dataframe column set value

import numpy as np
import pandas as pd

df1 = pd.DataFrame({'Key': ['OK340820.1', 'OK340821.1'], 'Length': [50000, 67000]})
df2 = pd.DataFrame({'Key': ['OK340820', 'OK340821'], 'Length': [np.nan, np.nan]})
If df2.Key is a substring of df1.Key, set the Length in df2 to the corresponding Length value from df1.
I tried doing this:
df2['Length']=np.where(df2.Key.isin(df1.Key.str.extract(r'(.+?(?=\.))')), df1.Length, '')
But it's not returning the matches.
Map df2.Key to the "prepared" Key values of df1:
df2['Length'] = df2.Key.map(dict(zip(df1.Key.str.replace(r'\..+', '', regex=True), df1.Length)))
In [45]: df2
Out[45]:
        Key  Length
0  OK340820   50000
1  OK340821   67000
You can use a regex to extract the string, then map the values:
import re
pattern = '|'.join(map(re.escape, df2['Key']))
s = pd.Series(df1['Length'].values, index=df1['Key'].str.extract(f'({pattern})', expand=False))
df2['Length'] = df2['Key'].map(s)
Updated df2:
        Key  Length
0  OK340820   50000
1  OK340821   67000
Or with a merge:
import re
pattern = '|'.join(map(re.escape, df2['Key']))
(df2.drop(columns='Length')
    .merge(df1, how='left', left_on='Key', suffixes=(None, '_'),
           right_on=df1['Key'].str.extract(f'({pattern})', expand=False))
    .drop(columns='Key_')
)
Alternative if the Key in df1 is always in the form XXX.1 and removing the .1 is enough:
df2['Length'] = df2['Key'].map(df1.set_index(df1['Key'].str.extract('([^.]+)', expand=False))['Length'])
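For the sample frames this should fill df2 the same way as the first approach:
        Key  Length
0  OK340820   50000
1  OK340821   67000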
Another possible solution, which is based on pandas.DataFrame.update:
df2.update(df1.assign(Key=df1['Key'].str.extract(r'(.*)\.')))
Output:
        Key   Length
0  OK340820  50000.0
1  OK340821  67000.0

Grouper by day and cumsum of speed

I have the following df:
I want to group this df on the first column (ID) and the second column (key), and from there build a cumsum for each day. The cumsum should be over the last column (Speed).
I tried this with the following code:
df = pd.read_csv('df.csv')
df['Time'] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M:%S')
df = df.sort_values(['ID', 'key'])
grouped = df.groupby(['ID', 'key'])
test = pd.DataFrame()
test2 = pd.DataFrame()
for name, group in grouped:
    test = group.groupby(pd.Grouper(key='Time', freq='1d'))['Speed'].cumsum()
    test = test.reset_index()
    test['ID'] = ''
    test['ID'] = name[0]
    test['key'] = ''
    test['key'] = name[1]
    test2 = test2.append(test)
But the result seems quite off; there are more than 5 rows. I expected one row per day with the cumsum for each ID and key.
Does anyone see the reason for my problem?
Thanks in advance.
Friendly reminder: it's useful to include a runnable example.
import pandas as pd
data = [{"cid":33613,"key":14855,"ts":1550577600000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550579340000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550584800000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550682000000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550685900000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550773380000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550858400000,"value":50.0},
{"cid":33613,"key":14855,"ts":1550941200000,"value":25.0},
{"cid":33613,"key":14855,"ts":1550978400000,"value":50.0}]
df = pd.DataFrame(data)
df['ts'] = pd.to_datetime(df['ts'], unit='ms')
I believe what you need can be accomplished as follows:
df.set_index('ts').groupby(['cid', 'key'])['value'].resample('D').sum().cumsum()
Result:
cid    key    ts
33613  14855  2019-02-19    150.0
              2019-02-20    250.0
              2019-02-21    300.0
              2019-02-22    350.0
              2019-02-23    375.0
              2019-02-24    425.0
Name: value, dtype: float64
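Note that the trailing .cumsum() runs over the whole resampled series; with several (cid, key) groups you would most likely want the cumulative sum per group, e.g. (a minimal sketch, same column names as above):
daily = df.set_index('ts').groupby(['cid', 'key'])['value'].resample('D').sum()
result = daily.groupby(level=['cid', 'key']).cumsum()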

Join 2 dataframes with special column matching

I want to join two dataframes and get the result below. I tried many ways, but they all fail.
I want only the texts in df2['A'] which contain a text from df1['A']. What do I need to change in my code?
I want:
0 A0_link0
1 A1_link1
2 A2_link2
3 A3_link3
import pandas as pd

df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
    })
df2 = pd.DataFrame(
    {
        "A": ["A0_link0", "A1_link1", "A2_link2", "A3_link3", "A4_link4", "An_linkn"],
        "B": ["B0_link0", "B1_link1", "B2_link2", "B3_link3", "B4_link4", "Bn_linkn"]
    })
result = pd.concat([df1, df2], ignore_index=True, join="inner", sort=False)
print(result)
Create an intermediate dataframe and map:
d = (df2.assign(key=df2['A'].str.extract(r'([^_]+)'))
        .set_index('key'))
df1['A'].map(d['A'])
Output:
0 A0_link0
1 A1_link1
2 A2_link2
3 A3_link3
Name: A, dtype: object
Or merge if you want several columns from df2 (df1.merge(d, left_on='A', right_index=True))
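For example (a sketch; both frames have an A column, so explicit suffixes keep the names apart):
out = df1.merge(d, left_on='A', right_index=True, suffixes=('', '_df2'))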
You can set the index to the An prefix and pd.concat on columns:
result = (pd.concat([df1.set_index(df1['A']),
                     df2.set_index(df2['A'].str.split('_').str[0])],
                    axis=1, join="inner", sort=False)
            .reset_index(drop=True))
print(result)
    A         A         B
0  A0  A0_link0  B0_link0
1  A1  A1_link1  B1_link1
2  A2  A2_link2  B2_link2
3  A3  A3_link3  B3_link3
df2.A.loc[df2.A.str.split('_',expand=True).iloc[:,0].isin(df1.A)]
0 A0_link0
1 A1_link1
2 A2_link2
3 A3_link3

Pandas dataframe - column with list of dictionaries, extract values and convert to comma separated values

I have the following dataframe, and I want to extract each numerical value from the list of dictionaries and keep them in the same column.
For instance, for the first row I would want to see in the data column: 179386782, 18017252, 123452
id     data
12345  [{'id': '179386782'}, {'id': 18017252}, {'id': 123452}]
Below is my code to create the dataframe above (I've hardcoded stories_data as an example):
for business_account in data:
    business_account_id = business_account[0]
    stories_data = {'data': [{'id': '179386782'}, {'id': '18017252'}, {'id': '123452'}]}
    df = pd.DataFrame(stories_data.items())
    df.set_index(0, inplace=True)
    df = df.transpose()
    df_stories['id'] = business_account_id
    col = df_stories.pop("id")
    df_stories.insert(0, col.name, col)
I've tried this: df_stories["data"].str[0]
but this only returns the first element (dictionary) in the list
Try:
df['data'] = df['data'].apply(lambda x: ', '.join([str(d['id']) for d in x]))
print(df)
# Output:
id data
0 12345 179386782, 18017252, 123452
Another way:
df['data'] = df['data'].explode().str['id'].astype(str) \
                       .groupby(level=0).agg(', '.join)
print(df)
# Output:
id data
0 12345 179386782, 18017252, 123452

Pandas to mark both if cell value is a substring of another

I have a column with short and full forms of people's names, and I want to unify them if one name is part of the other, e.g. "James.J" and "James.Jones" should both be tagged as "James.J".
import pandas as pd

data = {'Name': ["Amelia.Smith",
                 "Lucas.M",
                 "James.J",
                 "Elijah.Brown",
                 "Amelia.S",
                 "James.Jones",
                 "Benjamin.Johnson"]}
df = pd.DataFrame(data)
I can't figure out how to do it in pandas, so here is only an xlrd way, using the similarity ratio from SequenceMatcher (and sorting manually in Excel):
import xlrd
from xlrd import open_workbook, cellname
import xlwt
from xlutils.copy import copy

workbook = xlrd.open_workbook("C:\\TEM\\input.xlsx")
old_sheet = workbook.sheet_by_name("Sheet1")

from difflib import SequenceMatcher

wb = copy(workbook)
sheet = wb.get_sheet(0)
for row_index in range(0, old_sheet.nrows):
    current = old_sheet.cell(row_index, 0).value
    previous = old_sheet.cell(row_index - 1, 0).value
    sro = SequenceMatcher(None, current.lower(), previous.lower(), autojunk=True).ratio()
    if sro > 0.7:
        sheet.write(row_index, 1, previous)
        sheet.write(row_index - 1, 1, previous)
wb.save("C:\\TEM\\output.xls")
What's the nice pandas way to do it? Thank you.
Using pandas, make use of str.split and .map with some boolean conditions to identify the dupes:
df1 = df['Name'].str.split('.', expand=True).rename(columns={0: 'FName', 1: 'LName'})
df2 = df1.loc[df1['FName'].duplicated(keep=False)]\
         .assign(ky=df['Name'].str.len())\
         .sort_values('ky')\
         .drop_duplicates(subset=['FName'], keep='first').drop(columns='ky')
df['NewName'] = df1['FName'].map(df2.assign(newName=df2.agg('.'.join, 1))\
                                    .set_index('FName')['newName'])
print(df)
               Name   NewName
0      Amelia.Smith  Amelia.S
1           Lucas.M       NaN
2           James.J   James.J
3      Elijah.Brown       NaN
4          Amelia.S  Amelia.S
5       James.Jones   James.J
6  Benjamin.Johnson       NaN
Here is an example of using apply with a custom function. Sorting by Name first matters: a short form like "Amelia.S" sorts before "Amelia.Smith", so the longer name finds the shorter one already in memo. For small dfs this should be fine; this will not scale well for large dfs. A more sophisticated data structure for memo would be an ok place to start to improve performance without degrading readability too much:
df = df.sort_values("Name")
def short_name(row, col="Name", memo=[]):
name = row[col]
for m_name in memo:
if name.startswith(m_name):
return m_name
memo.append(name)
return name
df["short_name"] = df.apply(short_name, axis=1)
df = df.sort_index()
output:
               Name        short_name
0      Amelia.Smith          Amelia.S
1           Lucas.M           Lucas.M
2           James.J           James.J
3      Elijah.Brown      Elijah.Brown
4          Amelia.S          Amelia.S
5       James.Jones           James.J
6  Benjamin.Johnson  Benjamin.Johnson
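A rough sketch of that "more sophisticated data structure" idea, keying the memo by the part before the dot so each row only scans candidates that share its first name (short_name_fast and memo_by_first are made-up names; assumes the same df as above):
df = df.sort_values("Name")

memo_by_first = {}  # e.g. {"Amelia": ["Amelia.S"], "James": ["James.J"]}

def short_name_fast(row, col="Name"):
    name = row[col]
    first = name.split(".", 1)[0]
    # only compare against previously seen names with the same first name
    for m_name in memo_by_first.get(first, []):
        if name.startswith(m_name):
            return m_name
    memo_by_first.setdefault(first, []).append(name)
    return name

df["short_name"] = df.apply(short_name_fast, axis=1)
df = df.sort_index()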