I have the following dataframe created through the following chunk of code:
# Build the sample task table: one row per task with its start/end dates,
# the end of its reporting period, and a boolean status flag.
df = pd.DataFrame(
    {
        'task_id': [13412339, 13412343, 13412489],
        'start_date': ['07/03/2022', '07/03/2022', '07/02/2022'],
        'end_date': ['08/03/2022', '07/03/2022', '08/02/2022'],
        'end_period': ['10/03/2022', '09/03/2022', '07/03/2022'],
        'status': [1, 0, 0],
    }
)
df = df.astype(dtype={'status': bool})
# Parse the three date columns (pandas defaults to month-first for these strings).
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
df['end_period'] = pd.to_datetime(df['end_period'])
What I need to do here is to calculate the difference in days between the start_date and end_date columns if the status column is False, else it should do the same but between start_date and end_period columns.
The code that I have implemented to calculate the days differences between the start_date and end_date columns is as follows:
# Expand each task row into one row per day between start_date and end_date
# (inclusive).  Fixes vs the original:
# * DataFrame.append() was removed in pandas 2.0 (and re-copied the frame on
#   every call, O(n^2)); collect rows in a list and build the frame once.
# * the original used `dt.timedelta` without importing `dt`; pd.Timedelta
#   avoids that dependency.
expanded_rows = []
for row in range(df.shape[0]):
    # extract the row
    extracted_row = df.loc[row, :]
    # date difference in days for this task
    diff_days = (extracted_row['end_date'] - extracted_row['start_date']).days
    # repeat the row once for each full day in the interval
    for i in range(diff_days + 1):
        new_row = extracted_row.copy()
        new_row['date'] = new_row['start_date'] + pd.Timedelta(days=i)
        expanded_rows.append(new_row)
new_frame = pd.DataFrame(expanded_rows)
# Rearranges columns in the desired order
new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
# Changes data types
new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})
Then in order to calculate the differences if the status column is False, I did the following one:
# Expand each task row into one row per day, choosing the end column by the
# row's status: end_date when status is False, end_period when True.
# Fixes vs the original:
# * `if df['status'] == False:` compared the WHOLE column inside `if`, which
#   raises "ValueError: The truth value of a Series is ambiguous" — this is
#   the error the question hit.  Test the scalar status of the current row.
# * the else branch copied `extracted_row_end` instead of the row it had
#   just extracted (copy-paste bug).
# * DataFrame.append() was removed in pandas 2.0; collect rows and build
#   each frame once.
# * the column-reorder/astype steps were applied to the stale `new_frame`
#   variable instead of new_frame1/new_frame2.
new_frame1 = pd.DataFrame()
new_frame2 = pd.DataFrame()
column_order = ['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']
rows_end = []     # rows expanded against end_date   (status == False)
rows_period = []  # rows expanded against end_period (status == True)
for row in range(df.shape[0]):
    extracted_row = df.loc[row, :]
    if not extracted_row['status']:
        # status False -> difference between start_date and end_date
        end_column, bucket = 'end_date', rows_end
    else:
        # status True -> difference between start_date and end_period
        end_column, bucket = 'end_period', rows_period
    diff_days = (extracted_row[end_column] - extracted_row['start_date']).days
    # repeat the row for each full day in the interval
    for i in range(diff_days + 1):
        new_row = extracted_row.copy()
        new_row['date'] = new_row['start_date'] + pd.Timedelta(days=i)
        bucket.append(new_row[column_order])
if rows_end:
    new_frame1 = pd.DataFrame(rows_end)[column_order].astype({'task_id': int, 'status': bool})
if rows_period:
    new_frame2 = pd.DataFrame(rows_period)[column_order].astype({'task_id': int, 'status': bool})
# Merges both dataframes
frames = [new_frame1, new_frame2]
df = pd.concat(frames)
Then it throws an error as soon as the first for loop starts: `if df['status'] == False:` compares the whole status column at once, so Python raises `ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().`. This is where I need help: how to calculate the difference in days between the start_date and end_date columns when the status column is False, and otherwise between start_date and end_period.
Some part of your code did not work on my machine (so I just took the initial df from your first cell) - but when reading what you need, this is what I would do
import numpy as np
df['dayDiff']=np.where(df['status'],(df['end_period']-df['start_date']).dt.days,(df['end_date']-df['start_date']).dt.days)
df
As you already have booleans in df['status'], I would use that as the np.where condition, then calculate the day difference (df['end_period']-df['start_date']).dt.days when True, or the day difference (df['end_date']-df['start_date']).dt.days when False.
Related
I have a dataframe with date column and a holiday column
I have been trying to set even saturdays to 0. For example if a month has 5 saturdays - 2nd and 4th saturday must be set to 0 and other saturdays must be 1.
Code I tried:
import pandas as pd

# Calendar of every day in 2022, indexed by date, with a holiday flag:
# 0 for Sundays and for the 2nd/4th ("even") Saturday of each month, else 1.
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df['holiday'] = 1

def is_weekend(d):
    """Return 0 for Sundays and even (2nd/4th) Saturdays of a month, else 1."""
    if d.weekday() == 6:  # Sunday
        return 0
    # (d.day - 1) // 7 is the 0-based occurrence of this weekday within the
    # month, so an odd value means the 2nd or 4th Saturday (5th stays 1).
    elif d.weekday() == 5 and (d.day - 1) // 7 % 2 == 1:
        return 0
    else:
        return 1

# Fix: after set_index('date'), 'date' is the INDEX, not a column, so the
# original row['date'] raised KeyError — use row.name (the index label).
df['holiday'] = df.apply(lambda row: is_weekend(row.name) if row['holiday'] == 1 else 0, axis=1)
So here, all Sundays are set to 0 since they are holidays, and I tried setting every even Saturday to 0. But, for example, 30.07.2022 must be 1 (it is the 5th Saturday), yet it's being set to 0 by my code.
How can I get only even saturdays as 0?
you can use these codes
# Calendar of every day in 2022 with a holiday flag:
# 0 for Sundays and the 2nd/4th ("even") Saturday of each MONTH, else 1.
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
# Fix: the original keyed "even Saturday" off the positional index of the
# whole date range (index % 2), which alternates globally and drifts across
# month boundaries (e.g. 2022-08-06, a FIRST Saturday, was wrongly set to 0).
# Identify the 2nd/4th Saturday from the day of month instead:
# (day - 1) // 7 is the 0-based occurrence of that weekday in the month.
is_sunday = df['date'].dt.weekday == 6
is_even_saturday = (df['date'].dt.weekday == 5) & (((df['date'].dt.day - 1) // 7) % 2 == 1)
df['holiday'] = (~(is_sunday | is_even_saturday)).astype(int)
df
I have dataframe with column d1 and now i am trying calculate 'out' column after ranking that column when there in 'nan' value with in a column.
# Input frame: 14 car rows with a string d1 column containing 'nan' markers.
data_input = {
    'Name': ['Renault'] * 14,
    'type': ['Duster'] * 7 + ['Triber'] * 7,
    'd1': ['nan', '10', '10', '10', 'nan', 'nan', '20', '20', 'nan', 'nan', '30', '30', '30', 'nan'],
}
df_input = pd.DataFrame(data_input)
# Expected output: same columns plus the desired 'out' run-ranking.
# Fix: np.NaN was removed in NumPy 2.0 — np.nan is the supported spelling.
data_out = {
    **data_input,
    'out': [1, np.nan, np.nan, np.nan, 2, 2, np.nan, np.nan, 1, 1, np.nan, np.nan, np.nan, 2],
}
df_out = pd.DataFrame(data_out)
If, in that particular group, NaN appears before and after some values, then the rank should be ascending.
ex: rank for index-0 will be 1 and index-4&5 will be 2(because there is no after values in that group)
df_out["out"] = df_out.groupby(["Name","type"])['d1'].rank(method="first")
Use GroupBy.cumsum by consecutive mising values per groups:
# Convert d1 to numbers so the 'nan' strings become real missing values.
df_out['d1'] = pd.to_numeric(df_out['d1'], errors='coerce')
m = df_out['d1'].isna()
# A run of consecutive missing values starts where the row is missing but
# the previous row was not.
run_start = m & ~m.shift(fill_value=False)
# Number the missing runs within each (Name, type) group; keep the number
# only on missing rows, everything else becomes NaN.
grouped_runs = df_out.assign(a=run_start).groupby(["Name", "type"])['a']
df_out["out1"] = grouped_runs.cumsum().where(m)
Alternative solution with boolean indexing:
# Alternative: keep only the missing rows first, then cumsum the run-start
# flags per group; assignment back aligns on the index, so non-missing rows
# get NaN automatically.
missing_only = df_out.assign(a=(m & ~m.shift(fill_value=False)))[m]
df_out["out1"] = missing_only.groupby(["Name", "type"])['a'].cumsum()
I have a dataframe (df) as following
id date t_slot dayofweek label
1 2021-01-01 2 0 1
1 2021-01-02 3 1 0
2 2021-01-01 4 6 1
.......
The data frame is very large(6 million rows). the t_slot is from 1 to 6 value. dayofweek is from 0-6.
I want to get the rate:
- for each id, the rate at which label is 1 when t_slot is 1 to 4 and dayofweek is 0-4, over the 3 months before the date in each row.
- for each id, the rate at which label is 1 when t_slot is 5 to 6 and dayofweek is 5-6, over the 3 months before the date in each row.
I have used loop to compute the rate, but it is very slow, do you have fast way to compute it. My code is copied as following:
def get_time_slot_rate(df):
    """Return (work_14, work_56, weekend_14, weekend_56) label-1 rates.

    work_*   : rates among weekday rows (dayofweek < 5)
    weekend_*: rates among weekend rows (dayofweek >= 5)
    *_14     : share of the subgroup with label == 1 in time slots 1-4
    *_56     : share of the subgroup with label == 1 in time slots 5-6
    np.nan is returned for an empty df or an empty subgroup.
    """
    import numpy as np
    if df.empty:
        return np.nan, np.nan, np.nan, np.nan
    work = df.loc[df['dayofweek'] < 5]
    weekend = df.loc[df['dayofweek'] >= 5]
    if work.empty:
        work_14, work_56 = np.nan, np.nan
    else:
        # combine boolean masks with `&` instead of multiplying them
        labelled = work['label'] == 1
        work_14 = len(work.loc[(work['time_slot'] < 5) & labelled]) / len(work)
        # fix: "slots 5 to 6" means >= 5; the original `> 5` dropped slot 5
        work_56 = len(work.loc[(work['time_slot'] >= 5) & labelled]) / len(work)
    if weekend.empty:
        weekend_14, weekend_56 = np.nan, np.nan
    else:
        labelled = weekend['label'] == 1
        weekend_14 = len(weekend.loc[(weekend['time_slot'] < 5) & labelled]) / len(weekend)
        weekend_56 = len(weekend.loc[(weekend['time_slot'] >= 5) & labelled]) / len(weekend)
    return work_14, work_56, weekend_14, weekend_56
# Per-row driver: for every row, restrict df to the same id within the
# 3 months before that row's date and collect the four rates.
import datetime as d_t
lst_id = list(df['id'])
lst_date = list(df['date'])
lst_t14_work = []
lst_t56_work = []
lst_t14_weekend = []
lst_t56_weekend = []
for i in range(len(lst_id)):
    # progress indicator every 100 rows
    if i%100==0:
        print(i)
    d_date = lst_date[i]
    # NOTE(review): assumes df['date'] holds 'YYYY-MM-DD' strings — confirm
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    # window start: three months before the current row's date
    # NOTE(review): `relativedelta` is never imported here (it comes from
    # dateutil.relativedelta) — this loop raises NameError as posted
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    # NOTE(review): `easy_id`, `delivery_date` and `lst_easy` do not match the
    # `id`/`date` columns used above — looks pasted from another script; verify
    df_s = df.loc[(df['easy_id']==lst_easy[i])
    & ((df['delivery_date']>=pre_date)
    &(df['delivery_date']< d_date))].reset_index(drop=True)
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df_s)
    lst_t14_work.append(work_14_rate)
    lst_t56_work.append(work_56_rate)
    lst_t14_weekend.append(weekend_14_rate)
    lst_t56_weekend.append(weekend_56_rate)
I could only fix your function and it's completely untested, but here we go:
Import only once by putting the imports at the top of your .py.
try/except costs almost nothing when no exception is raised, though actually raising one is expensive — prefer it when the failure case is rare.
True and False equal 1 and 0 respectively in Python.
Don't multiply boolean selectors and use the reverse operator ~
Create the least amount of copies.
import numpy as np
def get_time_slot_rate(df):
    """Vectorised label-1 rates: (work_14, work_56, weekend_14, weekend_56).

    Fixes vs the posted version:
    * `df.loc[df['label']]` indexed rows by the 0/1 VALUES (label-based row
      lookup), not a boolean mask — filter with `df['label'] == 1` instead.
    * weekdays are dayofweek < 5 (5 and 6 are the weekend) and the "1-4"
      slots are time_slot < 5, matching the original function's cutoffs.
    * dividing by a NumPy integer zero yields nan/inf with a warning, not
      ZeroDivisionError, so the try/except never fired; test the counts.
    * rates are label-1 counts over the WHOLE subgroup, as in the original.
    """
    # much faster than counting
    if df.empty:
        return np.nan, np.nan, np.nan, np.nan
    # boolean selectors, inverted with '~' where needed
    weekdays = df['dayofweek'] < 5
    early_slot = df['time_slot'] < 5
    labelled = df['label'] == 1
    weekday_count = int(weekdays.sum())
    if weekday_count:
        work_14 = len(df.loc[weekdays & early_slot & labelled]) / weekday_count
        work_56 = len(df.loc[weekdays & ~early_slot & labelled]) / weekday_count
    else:
        work_14 = work_56 = np.nan
    weekend_count = int((~weekdays).sum())
    if weekend_count:
        weekend_14 = len(df.loc[~weekdays & early_slot & labelled]) / weekend_count
        weekend_56 = len(df.loc[~weekdays & ~early_slot & labelled]) / weekend_count
    else:
        weekend_14 = weekend_56 = np.nan
    return work_14, work_56, weekend_14, weekend_56
The rest of your script doesn't really make sense, see my comments:
# Annotated copy of the asker's driver loop, kept as-is to point at the
# undefined names; it is not runnable in this form.
for i in range(len(lst_id)):
    if i%100==0:
        print(i)
    # NOTE(review): `date` is undefined here — the earlier version used lst_date[i]
    d_date = date[i]
    # what is d_t ?
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    # NOTE(review): `easy_id`/`delivery_date`/`lst_easy` are not defined by
    # the question's setup code — confirm against the asker's real columns
    df_s = df.loc[(df['easy_id']==lst_easy[i])
    & (df['delivery_date']>=pre_date)
    &(df['delivery_date']< d_date)].reset_index(drop=True)
    # is it df or df_s ?
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df)
If your date column is a datetime object, then you can compare dates directly (no need for strings).
I'm very new to Python, and I want to calculate percentile ranks by group. My group is the wildlife management unit (WMU - string), and ranks are based on the value of predicted moose density (PMDEN3 - FLOAT). The rank value goes into the field RankMD.
My approach was to use the for loop to calculate the 3 ranks within each WMU, but the result is that 3 ranks are created for the entire dbf file (about 23,000 records), without respect to WMU. Any help is much appreciated.
import arcpy
import numpy as np
# NOTE(review): `input` shadows the Python builtin of the same name.
input = r'K:\Moose\KrigStratPython\TestRank3.dbf'
arr = arcpy.da.TableToNumPyArray(input, ('PMDEN3', 'Wmu'))
c_arr = [float(x[0]) for x in np.ndarray.flatten(arr)]
for Wmu in arr:
    ##to create 3 rank for example
    # NOTE(review): the loop variable `Wmu` is never used below, and the
    # percentiles are computed from `c_arr` (ALL records), so the same three
    # break points are applied to every WMU — this is the bug described in
    # the question text; the percentiles must be computed per WMU group.
    p1 = np.percentile(c_arr, 33) # rank = 0
    p2 = np.percentile(c_arr, 67) # rank = 1
    p3 = np.percentile(c_arr, 100) # rank = 2
    #use cursor to update the new rank field
    # NOTE(review): this cursor walks the WHOLE table on every loop pass, so
    # each pass overwrites the previous one; only the last pass survives.
    with arcpy.da.UpdateCursor(input , ['PMDEN3','RankMD']) as cursor:
        for row in cursor:
            if row[0] < p1:
                row[1] = 0 #rank 0
            elif p1 <= row[0] and row[0] < p2:
                row[1] = 1
            else:
                row[1] = 2
            cursor.updateRow(row)
Your for loop is correct, however, your UpdateCursor is iterating over all rows in the table. To get your desired result you need to select out a subset of the table, and then use the update cursor on that. You can do this by passing a query to the where_clause parameter of the UpdateCursor function.
So you would have a query like this:
current_wmu = WMU['wmu'] # This should be the value of the wmu that the for loop is currently on I think it would be WMU['wmu'] but i'm not positive
where_clause = "WMU = '{}'".format(current_wmu) # format the above variable into a query string
and then your UpdateCursor would now be:
with arcpy.da.UpdateCursor(input , ['PMDEN3','RankMD'], where_clause) as cursor:
Based on suggestion from BigGerman, I revised my code and this is now working. Script loops through each WMU value, and calculates rank percentile within each group based on PMDEN. To improve the script I should create an array of WMU values from my input file rather than manually creating the array.
# Per-WMU percentile ranking: loop over WMU values, restrict both the cursor
# and the percentile sample to that WMU via the same where_clause.
import arcpy
import numpy as np
#fields to be calculated
fldPMDEN = "PMDEN"
fldRankWMU = "RankWMU"
input = r'K:\Moose\KrigStratPython\TestRank3.dbf'
arcpy.MakeFeatureLayer_management(input, "stratLayerShpNoNullsLyr")
# NOTE(review): hard-coded WMU list — the author notes it should be derived
# from the input file instead of typed by hand.
WMUs = ["10", "11A", "11B", "11Q", "12A"]
for current_wmu in WMUs:
    ##to create 3 rank for example
    where_clause = "Wmu = '{}'".format(current_wmu) # format the above variable into a query
    with arcpy.da.UpdateCursor("stratLayerShpNoNullsLyr", [fldPMDEN,fldRankWMU], where_clause) as cursor:
        # percentile break points computed from THIS WMU's records only
        arr1 = arcpy.da.TableToNumPyArray("stratLayerShpNoNullsLyr", [fldPMDEN,fldRankWMU], where_clause)
        c_arrS = [float(x[0]) for x in np.ndarray.flatten(arr1)]
        p1 = np.percentile(c_arrS, 33) # rank = 3
        p2 = np.percentile(c_arrS, 67) # rank = 2
        p3 = np.percentile(c_arrS, 100) # rank = 1 (highest density)
        for row in cursor:
            if row[0] < p1:
                row[1] = 3 # rank 3 (lowest density)
            elif p1 <= row[0] and row[0] < p2:
                row[1] = 2
            else:
                row[1] = 1
            cursor.updateRow(row)
I've been trying to vectorize the following with no such luck:
Consider two data frames. One is a list of dates:
# For each date in the range, count how many df_main rows have n_date on or
# before that date.  Fixes vs the original:
# * the `active = len(...)` line was missing a closing parenthesis
#   (SyntaxError as posted);
# * DataFrame.append() was removed in pandas 2.0 — collect records and build
#   the frame once.
cols = ['col1', 'col2']
index = pd.date_range('1/1/15','8/31/18')
records = []
for x in range(len(index)):
    active = len(df_main[df_main.n_date <= index[x]])
    records.append([index[x], active])
df = pd.DataFrame(records, columns=cols)
Is there a way to vectorize the above?
What about something like the following
#initializing
mycols = ['col1', 'col2']
myindex = pd.date_range('1/1/15','8/31/18')
mydf = pd.DataFrame(columns = mycols )
#create df_main (that has each of myindex's dates minus 10 days)
df_main = pd.DataFrame(data=myindex-pd.Timedelta(days=10), columns=['n_date'])
#wrap a dataframe around a list comprehension
# Fix: pass columns=mycols so the result keeps the 'col1'/'col2' names the
# question uses (the original produced integer column labels 0 and 1).
mydf = pd.DataFrame([[x, len(df_main[df_main['n_date'] <= x])] for x in myindex],
                    columns=mycols)