Calculate Percentile Ranks by Group using Numpy

I'm very new to Python, and I want to calculate percentile ranks by group. My group is wildlife management unit (WMU, a string), and ranks are based on the value of predicted moose density (PMDEN3, a float). The rank value goes into the field RankMD.
My approach was to use the for loop to calculate the 3 ranks within each WMU, but the result is that 3 ranks are created for the entire dbf file (about 23,000 records), without respect to WMU. Any help is much appreciated.
import arcpy
import numpy as np

input = r'K:\Moose\KrigStratPython\TestRank3.dbf'
arr = arcpy.da.TableToNumPyArray(input, ('PMDEN3', 'Wmu'))
c_arr = [float(x[0]) for x in np.ndarray.flatten(arr)]

for Wmu in arr:
    ## to create 3 ranks, for example
    p1 = np.percentile(c_arr, 33)   # rank = 0
    p2 = np.percentile(c_arr, 67)   # rank = 1
    p3 = np.percentile(c_arr, 100)  # rank = 2
    # use a cursor to update the new rank field
    with arcpy.da.UpdateCursor(input, ['PMDEN3', 'RankMD']) as cursor:
        for row in cursor:
            if row[0] < p1:
                row[1] = 0  # rank 0
            elif p1 <= row[0] < p2:
                row[1] = 1
            else:
                row[1] = 2
            cursor.updateRow(row)

Your for loop is correct; however, your UpdateCursor is iterating over all rows in the table. To get your desired result you need to select out a subset of the table, and then run the update cursor on that. You can do this by passing a query to the where_clause parameter of the UpdateCursor function.
So you would have a query like this:
current_wmu = WMU['wmu']  # the value of the WMU the for loop is currently on; I think it would be WMU['wmu'], but I'm not positive
where_clause = "WMU = '{}'".format(current_wmu)  # format the above variable into a query string
and then your UpdateCursor would now be:
with arcpy.da.UpdateCursor(input , ['PMDEN3','RankMD'], where_clause) as cursor:
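Put together, the loop might look like this minimal sketch (the WMU values listed here are hypothetical; the field and table names come from the question):

for current_wmu in ["10", "11A"]:  # hypothetical WMU values
    where_clause = "Wmu = '{}'".format(current_wmu)
    # percentiles computed only over this WMU's rows
    c_arr = [float(x[0]) for x in arcpy.da.TableToNumPyArray(input, ('PMDEN3',), where_clause)]
    p1, p2 = np.percentile(c_arr, 33), np.percentile(c_arr, 67)
    with arcpy.da.UpdateCursor(input, ['PMDEN3', 'RankMD'], where_clause) as cursor:
        for row in cursor:
            row[1] = 0 if row[0] < p1 else (1 if row[0] < p2 else 2)
            cursor.updateRow(row)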

Based on the suggestion from BigGerman, I revised my code and it is now working. The script loops through each WMU value and calculates the rank percentile within each group based on PMDEN. To improve the script, I should build the array of WMU values from my input file rather than creating it manually.
import arcpy
import numpy as np

# fields to be calculated
fldPMDEN = "PMDEN"
fldRankWMU = "RankWMU"

input = r'K:\Moose\KrigStratPython\TestRank3.dbf'
arcpy.MakeFeatureLayer_management(input, "stratLayerShpNoNullsLyr")

WMUs = ["10", "11A", "11B", "11Q", "12A"]
for current_wmu in WMUs:
    ## to create 3 ranks, for example
    where_clause = "Wmu = '{}'".format(current_wmu)  # format the current WMU into a query
    with arcpy.da.UpdateCursor("stratLayerShpNoNullsLyr", [fldPMDEN, fldRankWMU], where_clause) as cursor:
        arr1 = arcpy.da.TableToNumPyArray("stratLayerShpNoNullsLyr", [fldPMDEN, fldRankWMU], where_clause)
        c_arrS = [float(x[0]) for x in np.ndarray.flatten(arr1)]
        p1 = np.percentile(c_arrS, 33)   # rank = 3
        p2 = np.percentile(c_arrS, 67)   # rank = 2
        p3 = np.percentile(c_arrS, 100)  # rank = 1 (highest density)
        for row in cursor:
            if row[0] < p1:
                row[1] = 3  # rank 3 (lowest density)
            elif p1 <= row[0] < p2:
                row[1] = 2
            else:
                row[1] = 1
            cursor.updateRow(row)
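To build the WMU list from the input file instead of hard-coding it, a SearchCursor over the Wmu field should work; a minimal untested sketch, assuming the field is named Wmu as in the question:

# collect the unique WMU values straight from the table
WMUs = sorted({row[0] for row in arcpy.da.SearchCursor(input, ['Wmu'])})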

Related

Creating pandas columns with a for loop

I have a dataframe created with the following chunk of code:
import pandas as pd

df = pd.DataFrame(
    [
        (13412339, '07/03/2022', '08/03/2022', '10/03/2022', 1),
        (13412343, '07/03/2022', '07/03/2022', '09/03/2022', 0),
        (13412489, '07/02/2022', '08/02/2022', '07/03/2022', 0),
    ],
    columns=['task_id', 'start_date', 'end_date', 'end_period', 'status']
)
df = df.astype(dtype={'status': bool})
df.start_date = pd.to_datetime(df.start_date)
df.end_date = pd.to_datetime(df.end_date)
df.end_period = pd.to_datetime(df.end_period)
What I need to do here is calculate the difference in days between the start_date and end_date columns when the status column is False; otherwise, do the same but between the start_date and end_period columns.
The code I implemented to calculate the day differences between the start_date and end_date columns is as follows:
import datetime as dt

new_frame = pd.DataFrame()
for row in range(df.shape[0]):
    # extract the row
    extracted_row = df.loc[row, :]
    # calculate the date difference in days for this row
    diff = extracted_row['end_date'] - extracted_row['start_date']
    diff_days = diff.days
    # iterate over the day difference and repeat the row for each full day
    for i in range(diff_days + 1):
        new_row = extracted_row.copy()
        new_row['date'] = new_row['start_date'] + dt.timedelta(days=i)
        new_row = new_row[['task_id', 'start_date', 'end_date',
                           'end_period', 'date', 'status']]
        # append the created row to the new dataframe
        new_frame = new_frame.append(new_row)

# rearrange columns in the desired order
new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
# change data types
new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})
Then, in order to calculate the differences when the status column is False, I did the following:
new_frame1 = pd.DataFrame()
new_frame2 = pd.DataFrame()
for row in range(df.shape[0]):
    # in this iteration, the status column should equal True
    if df['status'] == False:
        # extract the row
        extracted_row_end = df.loc[row, :]
        # calculate the date difference in days for this row
        diff1 = extracted_row_end['end_date'] - extracted_row_end['start_date']
        diff_days_end = diff1.days
        # iterate over the day difference and repeat the row for each full day
        for i in range(diff_days_end + 1):
            new_row_end = extracted_row_end.copy()
            new_row_end['date'] = new_row_end['start_date'] + dt.timedelta(days=i)
            new_row_end = new_row_end[['task_id', 'start_date', 'end_date',
                                       'end_period', 'date', 'status']]
            # append the created row to the new dataframe
            new_frame1 = new_frame1.append(new_row_end)
        # rearrange columns in the desired order
        new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
        # change data types
        new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})
    # in this iteration, the status column should equal False
    else:
        # extract the row
        extracted_row_period = df.loc[row, :]
        # calculate the date difference in days for this row
        diff2 = extracted_row_period['end_period'] - extracted_row_period['start_date']
        diff_days_period = diff2.days
        # iterate over the day difference and repeat the row for each full day
        for i in range(diff_days_period + 1):
            new_row_period = extracted_row_end.copy()
            new_row_period['date'] = new_row_period['start_date'] + dt.timedelta(days=i)
            new_row_period = new_row_period[['task_id', 'start_date', 'end_date',
                                             'end_period', 'date', 'status']]
            # append the created row to the new dataframe
            new_frame2 = new_frame2.append(new_row_period)
        # rearrange columns in the desired order
        new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
        # change data types
        new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})

# merge both dataframes
frames = [new_frame1, new_frame2]
df = pd.concat(frames)
It throws an error as soon as the first for loop starts. This is where I need help: how do I calculate the difference in days between start_date and end_date when the status column is False, and otherwise between start_date and end_period?
Some part of your code did not work on my machine (so I just took the initial df from your first cell), but reading what you need, this is what I would do:
import numpy as np

df['dayDiff'] = np.where(
    df['status'],
    (df['end_period'] - df['start_date']).dt.days,
    (df['end_date'] - df['start_date']).dt.days,
)
df
As you already have booleans in df['status'], I would use that as the np.where condition, then take the day difference (df['end_period'] - df['start_date']).dt.days when it is True and (df['end_date'] - df['start_date']).dt.days when it is False.
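If the row-repetition part of the question also needs speeding up, here is a vectorized sketch (my addition, not part of the answer above; it assumes pandas >= 1.1 for explode(ignore_index=True)):

import pandas as pd

# pick the relevant end column per row, build one date range per row, then explode
end = df['end_period'].where(df['status'], df['end_date'])
out = df.assign(date=[pd.date_range(s, e) for s, e in zip(df['start_date'], end)])
out = out.explode('date', ignore_index=True)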

Python 3: speeding up dataframe computation

I have a dataframe (df) like the following:

id  date        t_slot  dayofweek  label
1   2021-01-01  2       0          1
1   2021-01-02  3       1          0
2   2021-01-01  4       6          1
.......

The dataframe is very large (6 million rows). t_slot takes values from 1 to 6, and dayofweek from 0 to 6.
I want to get, for each row, the rates over that id's records in the past 3 months before the row's date:
- the rate of label == 1 when t_slot is 1 to 4 and dayofweek is 0-4;
- the rate of label == 1 when t_slot is 5 to 6 and dayofweek is 0-4;
- the rate of label == 1 when t_slot is 1 to 4 and dayofweek is 5-6;
- the rate of label == 1 when t_slot is 5 to 6 and dayofweek is 5-6.
I have used a loop to compute the rates, but it is very slow. Do you have a faster way to compute it? My code is copied below:
def get_time_slot_rate(df):
    import numpy as np
    if len(df) == 0:
        return np.nan, np.nan, np.nan, np.nan
    else:
        work = df.loc[df['dayofweek'] < 5]
        weekend = df.loc[df['dayofweek'] >= 5]
        if len(work) == 0:
            work_14, work_56 = np.nan, np.nan
        else:
            work_14 = len(work.loc[(work['time_slot'] < 5) * (work['label'] == 1)]) / len(work)
            work_56 = len(work.loc[(work['time_slot'] > 5) * (work['label'] == 1)]) / len(work)
        if len(weekend) == 0:
            weekend_14, weekend_56 = np.nan, np.nan
        else:
            weekend_14 = len(weekend.loc[(weekend['time_slot'] < 5) * (weekend['label'] == 1)]) / len(weekend)
            weekend_56 = len(weekend.loc[(weekend['time_slot'] > 5) * (weekend['label'] == 1)]) / len(weekend)
        return work_14, work_56, weekend_14, weekend_56

import datetime as d_t
from dateutil.relativedelta import relativedelta  # needed for relativedelta below

lst_id = list(df['id'])
lst_date = list(df['date'])
lst_t14_work = []
lst_t56_work = []
lst_t14_weekend = []
lst_t56_weekend = []
for i in range(len(lst_id)):
    if i % 100 == 0:
        print(i)
    d_date = lst_date[i]
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    df_s = df.loc[(df['easy_id'] == lst_easy[i])
                  & ((df['delivery_date'] >= pre_date)
                     & (df['delivery_date'] < d_date))].reset_index(drop=True)
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df_s)
    lst_t14_work.append(work_14_rate)
    lst_t56_work.append(work_56_rate)
    lst_t14_weekend.append(weekend_14_rate)
    lst_t56_weekend.append(weekend_56_rate)
I could only fix your function, and it's completely untested, but here we go:
- Import only once, by putting the imports at the top of your .py.
- try/except blocks can be more efficient than if/else checks when the exception is rare.
- True and False equal 1 and 0 respectively in Python.
- Don't multiply boolean selectors; combine them with & and invert them with the ~ operator.
- Create the least amount of copies.
import numpy as np

def get_time_slot_rate(df):
    # df.empty is much faster than counting with len()
    if df.empty:
        return np.nan, np.nan, np.nan, np.nan
    # boolean selectors, combined with '&' and inverted with '~'
    weekdays = df['dayofweek'] < 5         # 0-4 are weekdays, matching the original
    slot_selector = df['time_slot'] <= 4   # slots 1-4 (the question asks for 1-4 vs 5-6)
    labeled = df['label'] == 1             # explicit boolean mask
    weekday_count = int(np.sum(weekdays))  # plain int, so dividing by zero raises
    try:
        work_14 = len(df.loc[weekdays & slot_selector & labeled]) / weekday_count
        work_56 = len(df.loc[weekdays & ~slot_selector & labeled]) / weekday_count
    except ZeroDivisionError:
        work_14 = work_56 = np.nan
    weekend_count = int(np.sum(~weekdays))
    try:
        weekend_14 = len(df.loc[~weekdays & slot_selector & labeled]) / weekend_count
        weekend_56 = len(df.loc[~weekdays & ~slot_selector & labeled]) / weekend_count
    except ZeroDivisionError:
        weekend_14 = weekend_56 = np.nan
    return work_14, work_56, weekend_14, weekend_56
The rest of your script doesn't really make sense, see my comments:
for i in range(len(lst_id)):
    if i % 100 == 0:
        print(i)
    d_date = date[i]
    # what is d_t ?
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    df_s = df.loc[(df['easy_id'] == lst_easy[i])
                  & (df['delivery_date'] >= pre_date)
                  & (df['delivery_date'] < d_date)].reset_index(drop=True)
    # is it df or df_s ?
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df)
If your date column is a datetime object, then you can compare dates directly (no need for strings).
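For example, a small sketch of that last point, assuming df['date'] currently holds '%Y-%m-%d' strings as in the sample data:

import pandas as pd

df['date'] = pd.to_datetime(df['date'])      # parse the strings once, up front
d_date = df['date'].iloc[0]                  # example row date, now a Timestamp
pre_date = d_date - pd.DateOffset(months=3)  # datetime arithmetic instead of strings
mask = (df['date'] >= pre_date) & (df['date'] < d_date)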

Data frame: get row and update it

I want to select a row based on a condition and then update it in the dataframe.
One solution I found is to update df based on the condition, but then I must repeat the condition. What is a better solution, so that I get the desired row once and change it?
df.loc[condition, "top"] = 1
df.loc[condition, "pred_text1"] = 2
df.loc[condtion, "pred1_score"] = 3
something like:
row = df.loc[condition]
row["top"] = 1
row["pred_text1"] = 2
row["pred1_score"] = 3
Extract the boolean mask and set it as a variable.
m = condition
df.loc[m, 'top'] = 1
df.loc[m, 'pred_text1'] = 2
df.loc[m, 'pred1_score'] = 3
but the shortest way is:
df.loc[condition, ['top', 'pred_text1', 'pred1_score']] = [1, 2, 3]
Update
Isn't it possible to retrieve the index of the row and then update it by that index?
idx = df[condition].index
df.loc[idx, 'top'] = 1
df.loc[idx, 'pred_text1'] = 2
df.loc[idx, 'pred1_score'] = 3
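A quick self-contained usage sketch of the one-liner answer (the dataframe and condition here are made up for illustration):

import pandas as pd

df = pd.DataFrame({'val': [5, 15], 'top': 0, 'pred_text1': 0, 'pred1_score': 0.0})
condition = df['val'] > 10  # hypothetical condition
df.loc[condition, ['top', 'pred_text1', 'pred1_score']] = [1, 2, 3]
print(df)  # only the row with val > 10 is updated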

Can't get dimensions of arrays equal to plot with Matplotlib

I am trying to create a plot where one array is calculated from my x-axis in a for loop. I've gone through my code multiple times and checked the lengths of my arrays in between, but I can't find a way to make them equal length.
This is the code I have started with:
import numpy as np
import matplotlib.pyplot as plt

a = 1; b = 2; c = 3; d = 1; e = 2
t0 = 0
t_end = 10
dt = 0.05
t = np.arange(t0, t_end, dt)
n = len(t)
fout = 1
M = 1
Ca = np.zeros(n)
Ca[0] = a; Cb[0] = b
Cc[0] = 0
k1 = 1

def rA(Ca, Cb, Cc, t):
    return -k1 * Ca**a * Cb**b * dt

while e > 1e-3:
    t = np.arange(t0, t_end, dt)
    n = len(t)
    for i in range(1, n-1):
        Ca[i+1] = Ca[i] + rA(Ca[i], Cb[i], Cc[i], t[i])
    e = abs((M - Ca[n-1]) / M)
    M = Ca[n-1]
    dt = dt / 2

plt.plot(t, Ca)
plt.grid()
plt.show()
Afterwards, I try to calculate a second function for different y-values. Within the for loop I added:

Cb[i+1] = Cb[i] + rB(Ca[i], Cb[i], Cc[i], t[i])

while also defining rB in a manner similar to rA. The error I received at this point is:

IndexError: index 200 is out of bounds for axis 0 with size 200

I feel like it has to do with the way I'm initializing the arrays for Ca. In MATLAB, which I'm more familiar with, the initialization looks like this:

Ca = zeros(1,n)

I have recreated this code in MATLAB and I do get a plot, so I'm wondering where I'm going wrong here.
I thought my best course of action was to change n to an int by just changing it in the while loop, but after changing n = len(t) to n = 100 I received the following error message:

ValueError: x and y must have same first dimension, but have shapes (200,) and (400,)

As my previous question was something trivial I just kept missing, I feel like this is the same. But I have spent over an hour looking and trying fixes without success.
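The mismatch likely comes from halving dt: each pass of the while loop rebuilds t from the new dt (doubling its length) while Ca keeps its original size. A minimal, untested sketch of one possible fix, assuming Cb and Cc are initialized the same way as Ca, reallocates the arrays inside the loop:

while e > 1e-3:
    t = np.arange(t0, t_end, dt)
    n = len(t)
    # reallocate so the concentration arrays always match the new t
    Ca = np.zeros(n); Ca[0] = a
    Cb = np.zeros(n); Cb[0] = b
    Cc = np.zeros(n)
    for i in range(n - 1):
        Ca[i+1] = Ca[i] + rA(Ca[i], Cb[i], Cc[i], t[i])
        # Cb[i+1] would be updated here with rB in the same way
    e = abs((M - Ca[n-1]) / M)
    M = Ca[n-1]
    dt = dt / 2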

Fuzzy matching and iteration through DataFrame

I have these two DataFrames:
import pandas as pd

dico = {'Name': ['Arthur', 'Henri', 'Lisiane', 'Patrice'],
        'Age': ['20', '18', '62', '73'],
        'Studies': ['Economics', 'Maths', 'Psychology', 'Medical']
        }
dico2 = {'Surname': ['Henri2', 'Arthur1', 'Patrice4', 'Lisiane3']}
dico = pd.DataFrame.from_dict(dico)
dico2 = pd.DataFrame.from_dict(dico2)
I want to fuzzy match the Surname strings to the corresponding Names, to get an output as follows:
      Name   Surname Age     Studies
0   Arthur   Arthur1  20   Economics
1    Henri    Henri2  18       Maths
2  Lisiane  Lisiane3  62  Psychology
3  Patrice  Patrice4  73     Medical
and here is my code so far:
dico['Surname'] = []
for i in dico2:
    lst = [0, 0, 0]
    for j in dico:
        if lst[0] < fuzz.ratio(i, j):
            lst[0] = fuzz.ratio(i, j)
            lst[1] = i
            lst[2] = j
    dico['Surname'].append(i)
but I get ValueError: Length of values (0) does not match length of index (4), and I don't understand why. Thanks!
dico = {'Name': ['Arthur', 'Henri', 'Lisiane', 'Patrice'],
        'Age': ['20', '18', '62', '73'],
        'Studies': ['Economics', 'Maths', 'Psychology', 'Medical']
        }
dico2 = {'Surname': ['Henri2', 'Arthur1', 'Patrice4', 'Lisiane3']}
dico = pd.DataFrame.from_dict(dico)
dico2 = pd.DataFrame.from_dict(dico2)

temp = pd.DataFrame()
for x in range(0, len(dico.Name)):
    name_str = dico.Name[x]
    temp = pd.concat([temp, dico2[dico2.Surname.str.contains(name_str)].Surname])
temp.columns = ['Surname']
temp = temp.reset_index(drop=True)
dico = pd.concat([dico, temp], axis=1)
Solution
from rapidfuzz import fuzz  # assumption: fuzz.ratio from rapidfuzz (fuzzywuzzy works too)

map_list = []
for name in dico['Name']:
    best_ratio = None
    for idx, surname in enumerate(dico2['Surname']):
        if best_ratio is None:
            best_ratio = fuzz.ratio(name, surname)
            best_idx = 0
        else:
            ratio = fuzz.ratio(name, surname)
            if ratio > best_ratio:
                best_ratio = ratio
                best_idx = idx
    map_list.append(dico2['Surname'][best_idx])  # obtain the best-matching surname

dico['Surname'] = pd.Series(map_list)               # add column
dico = dico[['Name', 'Surname', 'Age', 'Studies']]  # reorder columns
Result: dico now matches the desired output above.
The error originates from
dico['Surname'] = []
dico['Surname'] is length 4, while [] is length 0. You can instead collect your surnames in a list and then add the surnames to the dataframe in one go after the loop.
You also need to tell the outer loop to iterate over dico2['Surname'] instead of the entire dataframe.
surnames = []
for i in dico2['Surname']:
    lst = [0, 0, 0]
    for j in dico:
        if lst[0] < fuzz.ratio(i, j):
            lst[0] = fuzz.ratio(i, j)
            lst[1] = i
            lst[2] = j
    surnames.append(i)
dico['Surname'] = surnames
EDIT: I only fixed the error in question. Also see maxbachmann's advice on not calling fuzz.ratio twice.
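A small sketch of that advice (my addition, not from the answers above): compute the ratio once per pair and reuse it.

surnames = []
for i in dico2['Surname']:
    lst = [0, 0, 0]
    for j in dico:
        ratio = fuzz.ratio(i, j)  # computed once, reused below
        if lst[0] < ratio:
            lst = [ratio, i, j]
    surnames.append(i)
dico['Surname'] = surnames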