Fuzzy matching and iteration through DataFrame - pandas

I have these two DataFrames: I want to fuzzy match the Surname strings to the corresponding Names
# Build the two example frames used throughout the question: one with the
# personal data, one with the decorated surnames to be matched.
dico = pd.DataFrame({
    'Name': ['Arthur', 'Henri', 'Lisiane', 'Patrice'],
    "Age": ["20", "18", "62", "73"],
    "Studies": ['Economics', 'Maths', 'Psychology', 'Medical'],
})
dico2 = pd.DataFrame({'Surname': ['Henri2', 'Arthur1', 'Patrice4', 'Lisiane3']})
I want to fuzzy match the Surname strings to the corresponding Names to have an output as follows
Name Surname Age Studies
0 Arthur Arthur1 20 Economics
1 Henri Henri2 18 Maths
2 Lisiane Lisiane3 62 Psychology
3 Patrice Patrice4 73 Medical
and here is my code so far:
# The asker's attempt, kept as-is for reference -- this is the code the
# answer below diagnoses.
dico['Surname'] = []  # BUG: assigns a length-0 list as a column of a 4-row frame -> ValueError
for i in dico2:  # NOTE: iterating a DataFrame yields its column NAMES ('Surname'), not the values
    lst = [0, 0, 0]
    for j in dico:  # likewise yields dico's column names, not the Name values
        if lst[0] < fuzz.ratio(i,j):
            lst[0] = fuzz.ratio(i,j)  # best score seen so far
            lst[1] = i  # best-scoring pair
            lst[2] = j
    dico['Surname'].append(i)  # never reached: the assignment above already raised
but I get a ValueError: Length of values (0) does not match length of index (4), and I don't understand why. Thanks!

# Rebuild the example frames, then align each Surname with its Name by
# substring containment and attach the matches as a new column.
dico = pd.DataFrame.from_dict({
    'Name': ['Arthur', 'Henri', 'Lisiane', 'Patrice'],
    "Age": ["20", "18", "62", "73"],
    "Studies": ['Economics', 'Maths', 'Psychology', 'Medical'],
})
dico2 = pd.DataFrame.from_dict({'Surname': ['Henri2', 'Arthur1', 'Patrice4', 'Lisiane3']})

# For every name, select the surname(s) containing it, in Name order.
pieces = [dico2.loc[dico2.Surname.str.contains(nm), 'Surname'] for nm in dico.Name]
temp = pd.concat(pieces).to_frame('Surname').reset_index(drop=True)
dico = pd.concat([dico, temp], axis=1)

Solution
# For every Name, find the dico2 Surname with the highest fuzz.ratio score
# and attach the matches as a new 'Surname' column.
map_list = []
for name in dico['Name']:
    best_ratio = None  # best score seen so far for this name
    best_idx = 0       # index of that score (fixed: was only set in the else-branch)
    for idx, surname in enumerate(dico2['Surname']):
        ratio = fuzz.ratio(name, surname)  # computed once per pair
        # fixed: compare to None with 'is', not '==' (PEP 8 identity check)
        if best_ratio is None or ratio > best_ratio:
            best_ratio = ratio
            best_idx = idx
    map_list.append(dico2['Surname'][best_idx])  # obtain best-matching surname
dico['Surname'] = pd.Series(map_list)  # add column
dico = dico[["Name", "Surname", "Age", "Studies"]]  # reorder columns
Result

The error originates from
dico['Surname'] = []
dico['Surname'] is length 4, while [] is length 0. You can instead collect your surnames in a list and then add the surnames to the dataframe in one go after the loop.
You also need to tell the outer loop to iterate over dico2['Surname'] instead of the entire dataframe.
# Collect the surnames in a plain list and attach them in ONE assignment
# after the loop (assigning [] to an existing 4-row frame is what raised
# the ValueError), and iterate the column's values rather than the frame.
surnames = []
for i in dico2['Surname']:  # the column's VALUES, not the DataFrame itself
    lst = [0, 0, 0]
    for j in dico:  # NOTE: still iterates dico's column names, and fuzz.ratio
                    # is called twice per pair -- deliberately left unfixed here
        if lst[0] < fuzz.ratio(i,j):
            lst[0] = fuzz.ratio(i,j)
            lst[1] = i
            lst[2] = j
    surnames.append(i)
dico['Surname'] = surnames
EDIT: this only fixes the error in the question. Also see maxbachmann's advice about not calling fuzz.ratio twice.

Related

Data frame: get row and update it

I want to select a row based on a condition and then update it in dataframe.
One solution I found is to update df based on the condition, but then I must repeat the condition for every column. What is a better solution, so that I get the desired row once and change it?
# One masked write per column; the mask expression is repeated each time.
df.loc[condition, "top"] = 1
df.loc[condition, "pred_text1"] = 2
df.loc[condition, "pred1_score"] = 3  # fixed typo: was 'condtion'
something like:
# The asker's desired pattern. NOTE(review): df.loc[boolean mask] returns a
# COPY, so these chained assignments would not write back into df
# (pandas raises SettingWithCopyWarning for exactly this).
row = df.loc[condition]
row["top"] = 1
row["pred_text1"] = 2
row["pred1_score"] = 3
Extract the boolean mask and set it as a variable.
# Evaluate the boolean mask once, bind it to a short name, and reuse it for
# every column assignment so the condition expression is not repeated.
m = condition
df.loc[m, 'top'] = 1
df.loc[m, 'pred_text1'] = 2
df.loc[m, 'pred1_score'] = 3
but the shortest way is:
# Single-statement form: one mask evaluation, one aligned multi-column write.
df.loc[condition, ['top', 'pred_text1', 'pred1_score']] = [1, 2, 3]  # fixed: 'pred_score' -> 'pred1_score' to match the other snippets
Update
Wasn't it possible to retrieve the index of row and then update it by that index?
# Retrieve the matching row labels once, then update by label.
idx = df[condition].index  # fixed: DataFrame has no '.idx' attribute; '.index' holds the row labels
df.loc[idx, 'top'] = 1
df.loc[idx, 'pred_text1'] = 2
df.loc[idx, 'pred1_score'] = 3

I am trying to append values to an empty 2D array dynamically, but I am getting an error every time

# Pre-allocate one output row of (lat, lon, mass_balance, elev) per data row.
output = np.empty([17157, 4])  # assumes data has 17157 rows -- TODO confirm
# for every row in data
for i, rows in enumerate(data):
    # initializing variables
    snowfall = 0
    positive_temp = 0
    mass_balance = 0
    melt = 0
    # for every cell in a row
    for columns in range(12):
        if rows[columns + 2] < 0:
            snowfall += rows[columns + 14]
        else:
            positive_temp += rows[columns + 2]
            melt += positive_temp * 7
    mass_balance += snowfall - melt
    lat = rows[0]
    lon = rows[1]
    elev = rows[26]
    # Write into the pre-allocated row. FIX: np.append returns a NEW array
    # (it never appends in place) and the original call discarded its result,
    # so 'output' was never filled.
    output[i] = [lat, lon, mass_balance, elev]

Dataframe index rows all 0's

I'm iterating through PDF's to obtain the text entered in the form fields. When I send the rows to a csv file it only exports the last row. When I print results from the Dataframe, all the row indexes are 0's. I have tried various solutions from stackoverflow, but I can't get anything to work, what should be 0, 1, 2, 3...etc. are coming in as 0, 0, 0, 0...etc.
Here is what I get when printing results, only the last row exports to csv file:
0
0 1938282828
0
0 1938282828
0
0 22222222
# The asker's loop: read the 'ID' form field from every PDF in the folder.
infile = glob.glob('./*.pdf')
for i in infile:
    if i.endswith('.pdf'):
        pdreader = PdfFileReader(open(i, 'rb'))  # NOTE(review): file handle is never closed
        diction = pdreader.getFormTextFields()
        myfieldvalue2 = str(diction['ID'])
        df = pd.DataFrame([myfieldvalue2])  # BUG (the question): rebinds df each pass, so only the last PDF survives
        print(df)  # fixed: removed the stray trailing backtick, which was a syntax error
Thank you for any help!
You are replacing the same dataframe each time:
# The asker's code quoted verbatim by the answer: df is re-created (rebound)
# on every iteration, so only the last PDF's value remains after the loop.
infile = glob.glob('./*.pdf')
for i in infile:
    if i.endswith('.pdf'):
        pdreader = PdfFileReader(open(i,'rb'))
        diction = pdreader.getFormTextFields()
        myfieldvalue2 = str(diction['ID'])
        df = pd.DataFrame([myfieldvalue2]) # this creates new df each time
        print(df)
Correct Code:
# Accumulate every 'ID' value, then build the frame ONCE after the loop.
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending row-by-row is quadratic; it also left every row labelled 0 --
# the very symptom the question describes. Building from a list gives a
# clean 0, 1, 2, ... index.
infile = glob.glob('./*.pdf')
values = []
for i in infile:
    if i.endswith('.pdf'):
        pdreader = PdfFileReader(open(i, 'rb'))
        diction = pdreader.getFormTextFields()
        myfieldvalue2 = str(diction['ID'])
        values.append(myfieldvalue2)
df = pd.DataFrame(values)
print(df)

Calculate Percentile Ranks by Group using Numpy

I'm very new to Python, and I want to calculate percentile ranks by group. My group is wildlife management unit (WMU - string), and ranks are based on the value of predicted moose density (PMDEN3 - float). The rank value goes into the field RankMD.
My approach was to use the for loop to calculate the 3 ranks within each WMU, but the result is that 3 ranks are created for the entire dbf file (about 23,000 records), without respect to WMU. Any help is much appreciated.
import arcpy
import numpy as np
input = r'K:\Moose\KrigStratPython\TestRank3.dbf'  # NOTE: shadows the builtin 'input'
arr = arcpy.da.TableToNumPyArray(input, ('PMDEN3', 'Wmu'))
# Flattens every PMDEN3 value of the WHOLE table into one list.
c_arr = [float(x[0]) for x in np.ndarray.flatten(arr)]
for Wmu in arr:
    ##to create 3 rank for example
    # BUG (the question): the percentiles are taken from c_arr, which spans
    # the entire table, and the loop variable Wmu is never used -- so the
    # ranks ignore the per-WMU grouping the asker wants.
    p1 = np.percentile(c_arr, 33) # rank = 0
    p2 = np.percentile(c_arr, 67) # rank = 1
    p3 = np.percentile(c_arr, 100) # rank = 2
    #use cursor to update the new rank field
    with arcpy.da.UpdateCursor(input , ['PMDEN3','RankMD']) as cursor:
        # Iterates ALL rows of the table on every pass of the outer loop.
        for row in cursor:
            if row[0] < p1:
                row[1] = 0 #rank 0
            elif p1 <= row[0] and row[0] < p2:
                row[1] = 1
            else:
                row[1] = 2
            cursor.updateRow(row)
Your for loop is correct, however, your UpdateCursor is iterating over all rows in the table. To get your desired result you need to select out a subset of the table, and then use the update cursor on that. You can do this by passing a query to the where_clause parameter of the UpdateCursor function.
So you would have a query like this:
# Value of the WMU the outer for loop is currently on (the answer guesses
# the expression is WMU['wmu'] -- verify against the actual loop variable).
current_wmu = WMU['wmu']
where_clause = "WMU = '{}'".format(current_wmu) # format the value into a SQL-style query string
and then your UpdateCursor would now be:
with arcpy.da.UpdateCursor(input , ['PMDEN3','RankMD'], where_clause) as cursor:
Based on suggestion from BigGerman, I revised my code and this is now working. Script loops through each WMU value, and calculates rank percentile within each group based on PMDEN. To improve the script I should create an array of WMU values from my input file rather than manually creating the array.
import arcpy
import numpy as np

# Rank PMDEN into terciles WITHIN each WMU: both the stats array and the
# update cursor are restricted to one WMU at a time via a where_clause.
# Fields to be calculated.
fldPMDEN = "PMDEN"
fldRankWMU = "RankWMU"
input_path = r'K:\Moose\KrigStratPython\TestRank3.dbf'  # renamed: 'input' shadowed the builtin
arcpy.MakeFeatureLayer_management(input_path, "stratLayerShpNoNullsLyr")
# TODO (per the author's own note): derive these WMU values from the input
# file instead of hard-coding them.
WMUs = ["10", "11A", "11B", "11Q", "12A"]
for current_wmu in WMUs:
    # Restrict both the cursor and the percentile sample to this WMU only.
    where_clause = "Wmu = '{}'".format(current_wmu)
    with arcpy.da.UpdateCursor("stratLayerShpNoNullsLyr", [fldPMDEN, fldRankWMU], where_clause) as cursor:
        arr1 = arcpy.da.TableToNumPyArray("stratLayerShpNoNullsLyr", [fldPMDEN, fldRankWMU], where_clause)
        c_arrS = [float(x[0]) for x in np.ndarray.flatten(arr1)]
        p1 = np.percentile(c_arrS, 33)   # upper bound of rank 3 (lowest density)
        p2 = np.percentile(c_arrS, 67)   # upper bound of rank 2
        p3 = np.percentile(c_arrS, 100)  # rank 1 (highest density)
        for row in cursor:
            if row[0] < p1:
                row[1] = 3  # rank 3: lowest-density tercile (comment fixed: previously said 'rank 0')
            elif p1 <= row[0] and row[0] < p2:
                row[1] = 2
            else:
                row[1] = 1
            cursor.updateRow(row)

ValueError: too many values to unpack (expected 4)

I am getting "ValueError: too many values to unpack (expected 4)" with the below code. Please help me!!
I am trying to lemmatize and cut off common words and then add to library so I can identify most common words and find the relationship between words.
def build_dataset(words, vocabulary_size):
    """Tokenize and lemmatize *words*, keep mid-frequency tokens, and build
    the integer-coded dataset plus forward/reverse lookup dictionaries.

    Returns (data, count, dictionary, reverse_dictionary).
    """
    lexicon = []
    for l in words:
        all_words = word_tokenize(l.lower())
        lexicon += list(all_words )
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    # Keep only words that are neither too common (>=5000) nor too rare (<=50).
    word = []
    for w in w_counts:
        if 5000 > w_counts[w] > 50 :
            word.append(w)
    print(len(word))
    # BUG FIX: the original had `return word` here, which made everything
    # below unreachable -- the caller's 4-way unpacking of that word list is
    # exactly what raised "ValueError: too many values to unpack (expected 4)".
    count = [['UNK', -1]]
    count.extend(collections.Counter(word).most_common(vocabulary_size - 1))
    dictionary = dict()
    for l2, _ in count:
        dictionary[l2] = len(dictionary)
    data = list()
    unk_count = 0  # how many tokens fell back to the 'UNK' bucket
    for l2 in word:
        if l2 in dictionary:
            index = dictionary[l2]
        else:
            index = 0  # index 0 is reserved for 'UNK'
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)