Extract 1st Column data and update 2ndColumn based on 1st Column data - pandas

I have an excel file with the following data:
LogID
T-1111
P-09899
P-09189,T-0011
T-111,T-2111
P-09099,P-7897
RCT-0989,RCT-099
I need to extract the first column LogID before the delimiter "-" and then populate a second column 'LogType' based on the string extracted (T is Tank LogType, P is Pump LogType)
For the above input, the output should be
LogID
LogType
T-1111
Tank
P-09899
Pump
P-09189,T-0011
Multiple
T-111,T-2111
Tank
P-09099,P-7897
Pump
RCT-0989,RCT-099
Reactor
I have written a function to do this in python:
def log_parser(log_string):
log_dict = { "T":"Tank","P":"Pump" }
log_list = log_string.split(",")
for i in log_list:
str_extract = i.upper().split("-",1)
if len(log_list) ==1:
result = log_dict[str_extract[0]]
return result
break
else:
idx = log_list.index(i)
for j in range(len(log_list)):
if (idx == j):
continue
str_extract_j = log_list[j].upper().split("-",1)
if str_extract_j[0] != str_extract[0]:
result = "Multiple"
return result
break
else:
result = log_dict[str_extract[0]]
return result
I am not sure how to implement this function in pandas..
Can i define the function in pandas and then use the lamba apply funtion like this:
test_df['LogType'] = test_df[['LogID']].apply(lambda x:log_parser(x), axis=1)

You can use:
# mapping dictionary for types
d = {'T': 'Tank', 'P': 'Pump'}
# extract letters before -
s = df['LogID'].str.extractall('([A-Z])-')[0]
# group by index
g = s.groupby(level=0)
df['LogType'] = (g.first() # get first match
.map(d) # map type name
# mask if several types
.mask(g.nunique().gt(1),
'Multiple')
)
Output:
LogID LogType
0 T-1111 Tank
1 P-09899 Pump
2 P-09189,T-0011 Multiple

Related

Better way to iterate through rows in a data frame and conditionally assign a group

I have created a function that assigns which latitude & longitude category each row fall in to. However, the issue is way too slow. How can I increase the performance?
Here is my code.
def assign_segment(use_df: pd.DataFrame,
lat_categories: pd.core.indexes.interval.IntervalIndex,
lng_categories: pd.core.indexes.interval.IntervalIndex) -> pd.DataFrame:
"""
Assign segments based on the latitude and longtitude column of "use_tb".
Parameters
----------
use_df : pd.DataFrame
Use DataFrame.
lat_categories : pd.core.indexes.interval.IntervalIndex
Latitude interval categories.
(ex.) IntervalIndex([(35.809, 35.816], (35.816, 35.824],
(35.824, 35.832], (35.832, 35.84], (35.84, 35.848]])
lng_categories : pd.core.indexes.interval.IntervalIndex
Lontitude interval categories.
(ex.) IntervalIndex([(128.668, 128.685], (128.685, 128.703],
(128.703, 128.72], (128.72, 128.737]])
Returns
-------
use_df : pd.DataFrame
"use_df" with segments assigned.
"""
segment = []
# iterate each row and get the segment according to latitude and longitude
for idx, row in use_df.iterrows():
use_lat = row['use_lat']
use_lng = row['use_lng']
for lat_idx, lat_category in enumerate(lat_categories):
if use_lat in lat_category:
lat_segment = lat_idx + 1
break
for lng_idx, lng_category in enumerate(lng_categories):
if use_lng in lng_category:
lng_segment = lng_idx + 1
break
num_lng_grid = len(lat_categories) # number of longtitude grid
lng_num_digits = len(str(num_lng_grid)) # number of digits of lng_grid
segment.append((lat_segment*10**lng_num_digits)+lng_segment)
# create the segment column with the segment list that we created in this function
use_df['segment'] = segment
return use_df
iterrows() is very slow when iterating over rows. Some developers think that iterrows should never be used. We have to resort to some vectorization to speed up the code. you can use tqdm.
from tqdm import tqdm
def assign_segment(use_df: pd.DataFrame,
lat_categories: pd.core.indexes.interval.IntervalIndex,
lng_categories: pd.core.indexes.interval.IntervalIndex) -> pd.DataFrame:
"""
Assign segments based on the latitude and longtitude column of "use_tb".
Parameters
----------
use_df : pd.DataFrame
Use DataFrame.
lat_categories : pd.core.indexes.interval.IntervalIndex
Latitude interval categories.
(ex.) IntervalIndex([(35.809, 35.816], (35.816, 35.824],
(35.824, 35.832], (35.832, 35.84], (35.84, 35.848]])
lng_categories : pd.core.indexes.interval.IntervalIndex
Lontitude interval categories.
(ex.) IntervalIndex([(128.668, 128.685], (128.685, 128.703],
(128.703, 128.72], (128.72, 128.737]])
Returns
-------
use_df : pd.DataFrame
"use_df" with segments assigned.
"""
segment = []
# iterate each row and get the segment according to latitude and longitude
for row in tqdm(use_df.to_dict('records')):
use_lat = row['use_lat']
use_lng = row['use_lng']
for lat_idx, lat_category in enumerate(lat_categories):
if use_lat in lat_category:
lat_segment = lat_idx + 1
break
for lng_idx, lng_category in enumerate(lng_categories):
if use_lng in lng_category:
lng_segment = lng_idx + 1
break
num_lng_grid = len(lat_categories) # number of longtitude grid
lng_num_digits = len(str(num_lng_grid)) # number of digits of lng_grid
segment.append((lat_segment*10**lng_num_digits)+lng_segment)
# create the segment column with the segment list that we created in this function
use_df['segment'] = segment
return use_df
It is worth reading these articles:
How To Make Your Pandas Loop 71803 Times Faster
Stop Using iterrows()
This is all you need to do here...
lat_segments = lat_categories.get_indexer(df['use_lat'])
lng_segments = lng_categories.get_indexer(df['use_lng'])
num_lng_grid = len(lat_categories)
lng_num_digits = len(str(num_lng_grid))
df['segment'] = (lat_segments*10**lng_num_digits)+lng_segments

Map elements of multiple columns in Pandas

I'm trying to label some values in a DataFrame in Pandas based on the value itself, in-place.
df = pd.read_csv('data/extrusion.csv')
# get list of columns that contain thickness
columns = [c for c in data.columns if 'SDickeIst'.lower() in c.lower()]
# create a function that returns the class based on value
def get_label(ser):
ser.map(lambda x : x if x == 0 else 1)
df[columns].apply(get_label)
I would expect that the apply function takes each column in particular and applies get_label on it. In turn, get_label gets the ser argument as a Series and uses map to map each element != 0 with 1.
get_label doesn't return anything.
You want to return ser.map(lambda x : x if x == 0 else 1).
def get_label(ser):
return ser.map(lambda x : x if x == 0 else 1)
Besides that, apply doesn't act in-place, it always returns a new object. Therefore you need
df[columns] = df[columns].apply(get_label)
But in this simple case, using DataFrame.where should be much faster if you are dealing with large DataFrames.
df[columns] = df[columns].where(lambda x: x == 0, 1)

Pandas Data frame column condition check based on length of the value

I have pandas data frame which gets created by reading an excel file. The excel file has a column called serial number. Then I pass a serial number to another function which connect to API and fetch me the result set for those serial number.
My Code -:
def create_excel(filename):
try:
data = pd.read_excel(filename, usecols=[4,18,19,20,26,27,28],converters={'Serial Number': '{:0>32}'.format})
except Exception as e:
sys.exit("Error reading %s: %s" % (filename, e))
data["Subject Organization"].fillna("N/A",inplace= True)
df = data[data['Subject Organization'].str.contains("Fannie",case = False)]
#df['Serial Number'].apply(lamda x: '000'+x if len(x) == 29 else '00'+x if len(x) == 30 else '0'+x if len(x) == 31 else x)
print(df)
df.to_excel(r'Data.xlsx',index= False)
output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
df2 = pd.DataFrame(output)
df2.columns = ['Output']
df5 = pd.concat([df,df2],axis = 1)
The problem I am facing is I want to check if df5 returned by fetch_by_ser_no() is blank then make the serial number as 34 characters by adding two more leading 00 and then check the function again.
How can I do it by not creating multiple dataframe
Any help!!
Thanks
You can try to use if ... else ...:
output = df['Serial Number'].apply(lambda x: 'ok' if fetch_by_ser_no(x) else 'badly')

Recursively update the dataframe

I have a dataframe called datafe from which I want to combine the hyphenated words.
for example input dataframe looks like this:
,author_ex
0,Marios
1,Christodoulou
2,Intro-
3,duction
4,Simone
5,Speziale
6,Exper-
7,iment
And the output dataframe should be like:
,author_ex
0,Marios
1,Christodoulou
2,Introduction
3,Simone
4,Speziale
5,Experiment
I have written a sample code to achieve this but I am not able to get out of the recursion safely.
def rm_actual(datafe, index):
stem1 = datafe.iloc[index]['author_ex']
stem2 = datafe.iloc[index + 1]['author_ex']
fixed_token = stem1[:-1] + stem2
datafe.drop(index=index + 1, inplace=True, axis=0)
newdf=datafe.reset_index(drop=True)
newdf.iloc[index]['author_ex'] = fixed_token
return newdf
def remove_hyphens(datafe):
for index, row in datafe.iterrows():
flag = False
token=row['author_ex']
if token[-1:] == '-':
datafe=rm_actual(datafe, index)
flag=True
break
if flag==True:
datafe=remove_hyphens(datafe)
if flag==False:
return datafe
datafe=remove_hyphens(datafe)
print(datafe)
Is there any possibilities I can get out of this recursion with expected output?
Another option:
Given/Input:
author_ex
0 Marios
1 Christodoulou
2 Intro-
3 duction
4 Simone
5 Speziale
6 Exper-
7 iment
Code:
import pandas as pd
# read/open file or create dataframe
df = pd.DataFrame({'author_ex':['Marios', 'Christodoulou', 'Intro-', \
'duction', 'Simone', 'Speziale', 'Exper-', 'iment']})
# check input format
print(df)
# create new column 'Ending' for True/False if column 'author_ex' ends with '-'
df['Ending'] = df['author_ex'].shift(1).str.contains('-$', na=False, regex=True)
# remove the trailing '-' from the 'author_ex' column
df['author_ex'] = df['author_ex'].str.replace('-$', '', regex=True)
# create new column with values of 'author_ex' and shifted 'author_ex' concatenated together
df['author_ex_combined'] = df['author_ex'] + df.shift(-1)['author_ex']
# create a series true/false but shifted up
index = (df['Ending'] == True).shift(-1)
# set the last row to 'False' after it was shifted
index.iloc[-1] = False
# replace 'author_ex' with 'author_ex_combined' based on true/false of index series
df.loc[index,'author_ex'] = df['author_ex_combined']
# remove rows that have the 2nd part of the 'author_ex' string and are no longer required
df = df[~df.Ending]
# remove the extra columns
df.drop(['Ending', 'author_ex_combined'], axis = 1, inplace=True)
# output final dataframe
print('\n\n')
print(df)
# notice index 3 and 6 are missing
Outputs:
author_ex
0 Marios
1 Christodoulou
2 Introduction
4 Simone
5 Speziale
6 Experiment

How to map different indices in Pyomo?

I am a new Pyomo/Python user. Now I need to formulate one set of constraints with index 'n', where all of the 3 components are with different indices but correlate with index 'n'. I am just curious that how I can map the relationship between these sets.
In my case, I read csv files in which their indices are related to 'n' to generate my set. For example: a1.n1, a2.n3, a3.n5 /// b1.n2, b2.n4, b3.n6, b4.n7 /// c1.n1, c2.n2, c3.n4, c4.n6 ///. The constraint expression of index n1 and n2 is the follows for example:
for n1: P(a1.n1) + L(c1.n1) == D(n1)
for n2: - F(b1.n2) + L(c2.n2) == D(n2)
Now let's go the coding. The set creating codes are as follow, they are within a class:
import pyomo
import pandas
import pyomo.opt
import pyomo.environ as pe
class MyModel:
def __init__(self, Afile, Bfile, Cfile):
self.A_data = pandas.read_csv(Afile)
self.A_data.set_index(['a'], inplace = True)
self.A_data.sort_index(inplace = True)
self.A_set = self.A_data.index.unique()
... ...
Then I tried to map the relationship in the constraint construction like follows:
def createModel(self):
self.m = pe.ConcreteModel()
self.m.A_set = pe.Set( initialize = self.A_set )
def obj_rule(m):
return ...
self.m.OBJ = pe.Objective(rule = obj_rule, sense = pe.minimize)
def constr(m, n)
As = self.A_data.reset_index()
Amap = As[ As['n'] == n ]['a']
Bs = self.B_data.reset_index()
Bmap = Bs[ Bs['n'] == n ]['b']
Cs = self.C_data.reset_index()
Cmap = Cs[ Cs['n'] == n ]['c']
return sum(m.P[(p,n)] for p in Amap) - sum(m.F[(s,n)] for s in Bmap) + sum(m.L[(r,n)] for r in Cmap) == self.D_data.ix[n, 'D']
self.m.cons = pe.Constraint(self.m.D_set, rule = constr)
def solve(self):
... ...
Finally, the error raises when I run this:
KeyError: "Index '(1, 1)' is not valid for indexed component 'P'"
I know it is the wrong way, so I am wondering if there is a good way to map their relationships. Thanks in advance!
Gabriel
I just forgot to post my answer to my own question when I solved this one week ago. The key thing towards this problem is setting up a map index.
Let me just modify the code in the question. Firstly, we need to modify the dataframe to include the information of the mapped indices. Then, the set for the mapped index can be constructed, taking 2 mapped indices as example:
self.m.A_set = pe.Set( initialize = self.A_set, dimen = 2 )
The names of the two mapped indices are 'alpha' and 'beta' respectively. Then the constraint can be formulated, based on the variables declared at the beginning:
def constr(m, n)
Amap = self.A_data[ self.A_data['alpha'] == n ]['beta']
Bmap = self.B_data[ self.B_data['alpha'] == n ]['beta']
return sum(m.P[(i,n)] for i in Amap) + sum(m.L[(r,n)] for r in Bmap) == D.loc[n, 'D']
m.TravelingBal = pe.Constraint(m.A_set, rule = constr)
The summation groups all associated B to A with a mapped index set.