set all even saturdays of every month to zero - pandas

I have a dataframe with a date column and a holiday column.
I have been trying to set even Saturdays to 0. For example, if a month has 5 Saturdays, the 2nd and 4th Saturdays must be set to 0 and the other Saturdays must be 1.
Code I tried:
import pandas as pd

start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df['holiday'] = 1

def is_weekend(d):
    if d.weekday() == 6:
        return 0
    elif d.weekday() == 5 and (d.day - 1) // 7 % 2 == 1:
        return 0
    else:
        return 1

df['holiday'] = df.apply(lambda row: is_weekend(row['date']) if row['holiday'] == 1 else 0, axis=1)
Here all Sundays are set to 0, since Sunday is a holiday, and I tried setting every even Saturday to 0 as well. But, for example, 30.07.2022 must be 1 (it is the 5th Saturday), yet my code sets it to 0.
How can I get only the even Saturdays set to 0?

You can use this code:
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df["day"] = df["date"].dt.day_name()
# nth occurrence of the weekday within its month: 1 for days 1-7, 2 for days 8-14, ...
df["week_of_month"] = (df["date"].dt.day - 1) // 7 + 1
df["holiday"] = df.apply(lambda x: 0 if any([x["day"] == "Sunday",
                                             x["day"] == "Saturday" and x["week_of_month"] % 2 == 0])
                         else 1, axis=1)
df = df.drop(["day", "week_of_month"], axis=1)
df
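
If you prefer to avoid apply entirely, the same rule can be written with vectorized datetime accessors. This is a minimal sketch of the same logic (my own addition, not part of the original answer):

import pandas as pd

df = pd.DataFrame({"date": pd.date_range("2022-01-01", "2022-12-31")})
d = df["date"].dt
# a Saturday is "even" when its occurrence number within the month is even
is_sunday = d.weekday == 6
is_even_saturday = (d.weekday == 5) & (((d.day - 1) // 7 + 1) % 2 == 0)
df["holiday"] = (~(is_sunday | is_even_saturday)).astype(int)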

Related

Creating pandas columns with for loop

I have a dataframe created with the following chunk of code:
import datetime as dt
import pandas as pd

df = pd.DataFrame(
    [
        (13412339, '07/03/2022', '08/03/2022', '10/03/2022', 1),
        (13412343, '07/03/2022', '07/03/2022', '09/03/2022', 0),
        (13412489, '07/02/2022', '08/02/2022', '07/03/2022', 0),
    ],
    columns=['task_id', 'start_date', 'end_date', 'end_period', 'status']
)
df = df.astype(dtype={'status': bool})
df.start_date = pd.to_datetime(df.start_date)
df.end_date = pd.to_datetime(df.end_date)
df.end_period = pd.to_datetime(df.end_period)
What I need to do here is calculate the difference in days between the start_date and end_date columns if the status column is False, otherwise the same but between the start_date and end_period columns.
The code that I have implemented to calculate the days differences between the start_date and end_date columns is as follows:
new_frame = pd.DataFrame()
for row in range(df.shape[0]):
    # extract the row
    extracted_row = df.loc[row, :]
    # calculate the date difference in days for this row
    diff = extracted_row['end_date'] - extracted_row['start_date']
    diff_days = diff.days
    # iterate over the date differences and repeat the row for each full day
    for i in range(diff_days + 1):
        new_row = extracted_row.copy()
        new_row['date'] = new_row['start_date'] + dt.timedelta(days=i)
        new_row = new_row[['task_id', 'start_date', 'end_date',
                           'end_period', 'date', 'status']]
        # append the created rows to the new dataframe
        new_frame = new_frame.append(new_row)
# rearrange columns in the desired order
new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
# change data types
new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})
Then, in order to calculate the differences depending on the status column, I did the following:
new_frame1 = pd.DataFrame()
new_frame2 = pd.DataFrame()
for row in range(df.shape[0]):
    # in this iteration, the status column should equal True
    if df['status'] == False:
        # extract the row
        extracted_row_end = df.loc[row, :]
        # calculate the date difference in days for this row
        diff1 = extracted_row_end['end_date'] - extracted_row_end['start_date']
        diff_days_end = diff1.days
        # iterate over the date differences and repeat the row for each full day
        for i in range(diff_days_end + 1):
            new_row_end = extracted_row_end.copy()
            new_row_end['date'] = new_row_end['start_date'] + dt.timedelta(days=i)
            new_row_end = new_row_end[['task_id', 'start_date', 'end_date',
                                       'end_period', 'date', 'status']]
            # append the created rows to the new dataframe
            new_frame1 = new_frame1.append(new_row_end)
        # rearrange columns in the desired order
        new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
        # change data types
        new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})
    # in this iteration, the status column should equal False
    else:
        # extract the row
        extracted_row_period = df.loc[row, :]
        # calculate the date difference in days for this row
        diff2 = extracted_row_period['end_period'] - extracted_row_period['start_date']
        diff_days_period = diff2.days
        # iterate over the date differences and repeat the row for each full day
        for i in range(diff_days_period + 1):
            new_row_period = extracted_row_end.copy()
            new_row_period['date'] = new_row_period['start_date'] + dt.timedelta(days=i)
            new_row_period = new_row_period[['task_id', 'start_date', 'end_date',
                                             'end_period', 'date', 'status']]
            # append the created rows to the new dataframe
            new_frame2 = new_frame2.append(new_row_period)
        # rearrange columns in the desired order
        new_frame = new_frame[['task_id', 'start_date', 'end_date', 'end_period', 'date', 'status']]
        # change data types
        new_frame = new_frame.astype(dtype={'task_id': int, 'status': bool})
# merge both dataframes
frames = [new_frame1, new_frame2]
df = pd.concat(frames)
It throws an error as soon as the first for loop starts. This is where I need help: how do I calculate the difference in days between the start_date and end_date columns when the status column is False, and otherwise between start_date and end_period?
The complete error is as follows:
Some part of your code did not work on my machine (so I just took the initial df from your first cell), but reading what you need, this is what I would do:
import numpy as np

df['dayDiff'] = np.where(df['status'],
                         (df['end_period'] - df['start_date']).dt.days,
                         (df['end_date'] - df['start_date']).dt.days)
df
Since you already have booleans in df['status'], I would use that as the np.where condition, then calculate the day difference as (df['end_period'] - df['start_date']).dt.days when it is True, and as (df['end_date'] - df['start_date']).dt.days when it is False.
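
If the goal is also the per-day row expansion from the question, the same np.where idea extends to a fully vectorized version. This is my own sketch (untested on the real data; explode(ignore_index=True) assumes pandas >= 1.1):

import numpy as np
import pandas as pd

# effective end per row: end_period when status is True, end_date otherwise
end = pd.Series(np.where(df['status'], df['end_period'], df['end_date']), index=df.index)
df['dayDiff'] = (end - df['start_date']).dt.days
# one row per day between start_date and the effective end (inclusive),
# replacing the nested append loops
df['date'] = [pd.date_range(s, e) for s, e in zip(df['start_date'], end)]
df = df.explode('date', ignore_index=True)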

Python3 to speed up the computing of dataframe

I have a dataframe (df) like the following:

   id        date  t_slot  dayofweek  label
    1  2021-01-01       2          0      1
    1  2021-01-02       3          1      0
    2  2021-01-01       4          6      1
  ...

The dataframe is very large (6 million rows). t_slot takes values from 1 to 6, and dayofweek from 0 to 6.
I want to get, for each id, the rate at which label is 1 over the 3 months before the date in each row, split four ways:
- when t_slot is 1 to 4 and dayofweek is 0 to 4;
- when t_slot is 5 to 6 and dayofweek is 0 to 4;
- when t_slot is 1 to 4 and dayofweek is 5 to 6;
- when t_slot is 5 to 6 and dayofweek is 5 to 6.
I have used a loop to compute the rates, but it is very slow. Is there a faster way to compute them? My code is copied below:
def get_time_slot_rate(df):
    import numpy as np
    if len(df) == 0:
        return np.nan, np.nan, np.nan, np.nan
    else:
        work = df.loc[df['dayofweek'] < 5]
        weekend = df.loc[df['dayofweek'] >= 5]
        if len(work) == 0:
            work_14, work_56 = np.nan, np.nan
        else:
            work_14 = len(work.loc[(work['time_slot'] < 5) * (work['label'] == 1)]) / len(work)
            work_56 = len(work.loc[(work['time_slot'] > 5) * (work['label'] == 1)]) / len(work)
        if len(weekend) == 0:
            weekend_14, weekend_56 = np.nan, np.nan
        else:
            weekend_14 = len(weekend.loc[(weekend['time_slot'] < 5) * (weekend['label'] == 1)]) / len(weekend)
            weekend_56 = len(weekend.loc[(weekend['time_slot'] > 5) * (weekend['label'] == 1)]) / len(weekend)
        return work_14, work_56, weekend_14, weekend_56
import datetime as d_t
from dateutil.relativedelta import relativedelta

lst_id = list(df['id'])
lst_date = list(df['date'])
lst_t14_work = []
lst_t56_work = []
lst_t14_weekend = []
lst_t56_weekend = []
for i in range(len(lst_id)):
    if i % 100 == 0:
        print(i)
    d_date = lst_date[i]
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    df_s = df.loc[(df['easy_id'] == lst_easy[i])
                  & ((df['delivery_date'] >= pre_date)
                     & (df['delivery_date'] < d_date))].reset_index(drop=True)
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df_s)
    lst_t14_work.append(work_14_rate)
    lst_t56_work.append(work_56_rate)
    lst_t14_weekend.append(weekend_14_rate)
    lst_t56_weekend.append(weekend_56_rate)
I could only fix your function and it's completely untested, but here we go:
- Import only once, by putting the imports at the top of your .py.
- try/except blocks are more efficient than if/else statements.
- True and False equal 1 and 0 respectively in Python.
- Don't multiply boolean selectors; combine them with & and invert them with ~.
- Create the least amount of copies.
import numpy as np

def get_time_slot_rate(df):
    # much faster than counting
    if df.empty:
        return np.nan, np.nan, np.nan, np.nan
    # boolean selectors, combined with '&' and inverted with '~'
    labeled = df['label'].astype(bool)   # assuming df['label'] is either 0 or 1
    weekdays = df['dayofweek'] < 5       # dayofweek 0-4 -> weekday, 5-6 -> weekend
    slots_14 = df['time_slot'] <= 4      # time slots 1-4 vs slots 5-6
    weekday_count = int(weekdays.sum())
    try:
        work_14 = int((labeled & weekdays & slots_14).sum()) / weekday_count
        work_56 = int((labeled & weekdays & ~slots_14).sum()) / weekday_count
    except ZeroDivisionError:
        work_14 = work_56 = np.nan
    weekend_count = int((~weekdays).sum())
    try:
        weekend_14 = int((labeled & ~weekdays & slots_14).sum()) / weekend_count
        weekend_56 = int((labeled & ~weekdays & ~slots_14).sum()) / weekend_count
    except ZeroDivisionError:
        weekend_14 = weekend_56 = np.nan
    return work_14, work_56, weekend_14, weekend_56
The rest of your script doesn't really make sense, see my comments:
for i in range(len(lst_id)):
    if i % 100 == 0:
        print(i)
    d_date = lst_date[i]
    # what is d_t ?
    dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
    month_step = relativedelta(months=3)
    pre_date = str(dt - month_step).split(' ')[0]
    df_s = df.loc[(df['easy_id'] == lst_easy[i])
                  & (df['delivery_date'] >= pre_date)
                  & (df['delivery_date'] < d_date)].reset_index(drop=True)
    # is it df or df_s ?
    work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df)
If your date column is a datetime object, then you can compare dates directly (no need for strings).
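
To illustrate that last point (my own sketch, not part of the original answer): if df['delivery_date'] is parsed once with pd.to_datetime, the three-month window can be computed and compared without any strptime or string splitting:

import pandas as pd

df['delivery_date'] = pd.to_datetime(df['delivery_date'])
d_date = pd.Timestamp('2021-04-01')           # hypothetical row date
pre_date = d_date - pd.DateOffset(months=3)   # three months back, no strptime needed
window = df.loc[(df['delivery_date'] >= pre_date) & (df['delivery_date'] < d_date)]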

What will be the best approach, apply function or for loop in pandas dataframe for replacing null values?

I have a dataset with 100000 records for which I need to replace null values based on multiple columns.
I have tried two approaches:
# First approach
import time
import numpy as np
import pandas as pd

# Missing value treatment
start_time = time.time()
data['date_of_last_rech_data_6'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_6'], x['total_rech_data_6'], x['max_rech_data_6']])) else x['date_of_last_rech_data_6'], axis=1)
data['total_rech_data_6'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_6'], x['total_rech_data_6'], x['max_rech_data_6']])) else x['total_rech_data_6'], axis=1)
data['max_rech_data_6'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_6'], x['total_rech_data_6'], x['max_rech_data_6']])) else x['max_rech_data_6'], axis=1)
data['date_of_last_rech_data_7'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_7'], x['total_rech_data_7'], x['max_rech_data_7']])) else x['date_of_last_rech_data_7'], axis=1)
data['total_rech_data_7'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_7'], x['total_rech_data_7'], x['max_rech_data_7']])) else x['total_rech_data_7'], axis=1)
data['max_rech_data_7'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_7'], x['total_rech_data_7'], x['max_rech_data_7']])) else x['max_rech_data_7'], axis=1)
data['date_of_last_rech_data_8'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_8'], x['total_rech_data_8'], x['max_rech_data_8']])) else x['date_of_last_rech_data_8'], axis=1)
data['total_rech_data_8'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_8'], x['total_rech_data_8'], x['max_rech_data_8']])) else x['total_rech_data_8'], axis=1)
data['max_rech_data_8'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_8'], x['total_rech_data_8'], x['max_rech_data_8']])) else x['max_rech_data_8'], axis=1)
data['date_of_last_rech_data_9'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_9'], x['total_rech_data_9'], x['max_rech_data_9']])) else x['date_of_last_rech_data_9'], axis=1)
data['total_rech_data_9'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_9'], x['total_rech_data_9'], x['max_rech_data_9']])) else x['total_rech_data_9'], axis=1)
data['max_rech_data_9'] = data.apply(lambda x: 0 if np.all(pd.isnull([x['date_of_last_rech_data_9'], x['total_rech_data_9'], x['max_rech_data_9']])) else x['max_rech_data_9'], axis=1)
end_time = time.time()
print(end_time - start_time)
Time taken by this snippet is 152.52092480659485.
# Second approach
start_time = time.time()
for i in range(0, len(data)):
    # Missing value treatment for the month of June
    if pd.isnull((data['date_of_last_rech_data_6'][i]) and (data['total_rech_data_6'][i]) and (data['max_rech_data_6'][i])):
        data['date_of_last_rech_data_6'][i] = 0
        data['total_rech_data_6'][i] = 0
        data['max_rech_data_6'][i] = 0
    # Missing value treatment for the month of July
    if pd.isnull((data['date_of_last_rech_data_7'][i]) and (data['total_rech_data_7'][i]) and (data['max_rech_data_7'][i])):
        data['date_of_last_rech_data_7'][i] = 0
        data['total_rech_data_7'][i] = 0
        data['max_rech_data_7'][i] = 0
    # Missing value treatment for the month of August
    if pd.isnull((data['date_of_last_rech_data_8'][i]) and (data['total_rech_data_8'][i]) and (data['max_rech_data_8'][i])):
        data['date_of_last_rech_data_8'][i] = 0
        data['total_rech_data_8'][i] = 0
        data['max_rech_data_8'][i] = 0
    # Missing value treatment for the month of September
    if pd.isnull((data['date_of_last_rech_data_9'][i]) and (data['total_rech_data_9'][i]) and (data['max_rech_data_9'][i])):
        data['date_of_last_rech_data_9'][i] = 0
        data['total_rech_data_9'][i] = 0
        data['max_rech_data_9'][i] = 0
end_time = time.time()
print(end_time - start_time)
Time taken by this code is 223.60794281959534 seconds. But this code sometimes runs fine and sometimes just hangs and kills the kernel.
Is there a better approach for this?
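
For reference, one common vectorized alternative is to build a boolean mask per month with isnull().all(axis=1) and assign through .loc, avoiding both apply and the Python loop. This is a sketch under the assumption that the columns exist exactly as named above:

import pandas as pd

for month in ['6', '7', '8', '9']:
    cols = ['date_of_last_rech_data_' + month,
            'total_rech_data_' + month,
            'max_rech_data_' + month]
    # rows where all three columns for this month are null
    mask = data[cols].isnull().all(axis=1)
    # set all three columns to 0 for those rows in one assignment
    data.loc[mask, cols] = 0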

pandas merge two dataframe to form a multiindex

I'm playing around with Pandas to see if I can do some stock calculations better/faster than with other tools. If I have a single stock, it's easy to create a daily calculation:
df['mystuff'] = df['Close']+1
If I download more than one ticker it gets complicated:
df = df.stack()
df['mystuff'] = df['Close']+1
df = df.unstack()
If I want to use the previous day's "Close" it gets too complex for me. I thought I might go back to fetching a single ticker, do any operation with iloc[i-1] or something similar (I haven't figured it out yet), and then merge the dataframes.
How do I merge two dataframes of single tickers to form a multiindex?
So that:
f1 = web.DataReader('AAPL', 'yahoo', start, end)
f2 = web.DataReader('GOOG', 'yahoo', start, end)
is like
f = web.DataReader(['AAPL','GOOG'], 'yahoo', start, end)
Edit:
This is the nearest thing to f I can create. It's not exactly the same, so I'm not sure I can use it instead of f.
f_f = pd.concat({'AAPL': f1, 'GOOG': f2}, axis=1)
Maybe I should experiment with operations working on a multiindex instead of splitting the work across simpler dataframes.
Full Code:
import pandas_datareader.data as web
import pandas as pd
from datetime import datetime
start = datetime(2001, 9, 1)
end = datetime(2019, 8, 31)
a = web.DataReader('AAPL', 'yahoo', start, end)
g = web.DataReader('GOOG', 'yahoo', start, end)
# here are shift/diff calculations that I don't know how to do with a multiindex
a_g = web.DataReader(['AAPL','GOOG'], 'yahoo', start, end)
merged = pd.concat({'AAPL':a,'GOOG':g},axis=1)
a_g.to_csv('ag.csv')
merged.to_csv('merged.csv')
import code; code.interact(local=locals())
Side note: I don't know how to compare the two CSV files.
This is not exactly the same, but it returns a MultiIndex you can use as in the a_g case:
import pandas_datareader.data as web
import pandas as pd
from datetime import datetime

start = datetime(2019, 7, 1)
end = datetime(2019, 8, 31)
out = []
for tick in ["AAPL", "GOOG"]:
    d = web.DataReader(tick, 'yahoo', start, end)
    cols = [(col, tick) for col in d.columns]
    d.columns = pd.MultiIndex.from_tuples(cols,
                                          names=['Attributes', 'Symbols'])
    out.append(d)
df = pd.concat(out, axis=1)
Update
In case you want to calculate and add a new column when you have MultiIndex columns, you can do the following:
import pandas_datareader.data as web
import pandas as pd
from datetime import datetime

start = datetime(2019, 7, 1)
end = datetime(2019, 8, 31)
ticks = ['AAPL', 'GOOG']
df = web.DataReader(ticks, 'yahoo', start, end)
names = list(df.columns.names)
df1 = df["Close"].shift()
cols = [("New", col) for col in df1.columns]
df1.columns = pd.MultiIndex.from_tuples(cols, names=names)
df = df.join(df1)
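
Since the original question was about using the previous day's Close: after the join, df["New"] holds the shifted (previous-day) closes per ticker, so a daily change is one column-aligned subtraction (my example, not part of the original answer):

# per-ticker daily change based on the shifted close
change = df["Close"] - df["New"]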

Group by based on an if statement

I have a df that contains ids and timestamps.
I was looking to group by the id and then apply a condition on the timestamps of the two rows in each group: something like, if the first row's timestamp is less than the second row's, then 1, else 2.
Basically, grouping the ids, I want a value of 1 if the first row's timestamp is earlier than the second's, and 2 if the second row's timestamp is earlier than the first's.
Updated output below, where the last two values should be 2.
Use to_timedelta to convert the times, then aggregate per group by comparing the first and last values, and finally map the boolean result back with numpy.where to assign the new column:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'ID Code': ['a', 'a', 'b', 'b'],
    'Time Created': ['21:25:27', '21:12:09', '21:12:00', '21:12:40']
})
df['Time Created'] = pd.to_timedelta(df['Time Created'])
mask = df.groupby('ID Code')['Time Created'].agg(lambda x: x.iat[0] < x.iat[-1])
print (mask)
ID Code
a    False
b     True
Name: Time Created, dtype: bool

df['new'] = np.where(df['ID Code'].map(mask), 1, 2)
print (df)
  ID Code Time Created  new
0       a     21:25:27    2
1       a     21:12:09    2
2       b     21:12:00    1
3       b     21:12:40    1
Another solution uses transform to broadcast the aggregated value back to a new column, here as a boolean mask:
df['Time Created'] = pd.to_timedelta(df['Time Created'])
mask = df.groupby('ID Code')['Time Created'].transform(lambda x: x.iat[0] > x.iat[-1])
print (mask)
0     True
1     True
2    False
3    False
Name: Time Created, dtype: bool

df['new'] = np.where(mask, 2, 1)
print (df)
  ID Code Time Created  new
0       a     21:25:27    2
1       a     21:12:09    2
2       b     21:12:00    1
3       b     21:12:40    1
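
A related variant (my own sketch, on the same sample data): let transform broadcast each group's first and last values directly, which avoids the Python-level lambda entirely:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'ID Code': ['a', 'a', 'b', 'b'],
    'Time Created': pd.to_timedelta(['21:25:27', '21:12:09', '21:12:00', '21:12:40'])
})
first = df.groupby('ID Code')['Time Created'].transform('first')
last = df.groupby('ID Code')['Time Created'].transform('last')
df['new'] = np.where(first < last, 1, 2)   # 1 when the group's first time is earlier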