I have a dataframe with date column and a holiday column
I have been trying to set even saturdays to 0. For example if a month has 5 saturdays - 2nd and 4th saturday must be set to 0 and other saturdays must be 1.
Code I tried:
import pandas as pd
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df['holiday'] = 1
def is_weekend(d):
if d.weekday() == 6:
return 0
elif d.weekday() == 5 and (d.day-1)//7%2 == 1:
return 0
else:
return 1
df['holiday'] = df.apply(lambda row: is_weekend(row['date']) if row['holiday'] == 1 else 0, axis=1)
so here, all sundays are set to 0 since its a holiday and i tried setting every even saturday to 0. But For example, 30.07.2022 must be 1 (5th saturday) but its being set to 0 with my code.
How can I get only even saturdays as 0?
you can use these codes
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df = df.reset_index()
df["day"] = df["date"].dt.day_name()
df["holiday"] = df.apply(lambda x: 0 if any([x["day"]=="Sunday",(x["index"]%2==1 and x["day"]=="Saturday")]) else 1, axis=1)
df = df.drop(["index","day"], axis=1)
df
I'm trying to get a subset of my data whenever there is consecutive occurrence of an two events in that order. The event is time-stamped. So every time there are continuous 2's and then continuous 3's, I want to subset that to a dataframe and append it to a dictionary. The following code does that but I have to apply this to a very large dataframe of more than 20 mil obs. This is extremely slow using iterrows. How can I make this fast?
df = pd.DataFrame({'Date': [101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122],
'Event': [1,1,2,2,2,3,3,1,3,2,2,3,1,2,3,2,3,2,2,3,3,3]})
dfb = pd.DataFrame(columns = df.columns)
C = {}
f1 = 0
for index, row in df.iterrows():
if ((row['Event'] == 2) & (3 not in dfb['Event'].values)):
dfb = dfb.append(row)
f1 =1
elif ((row['Event'] == 3) & (f1 == 1)):
dfb = dfb.append(row)
elif 3 in dfb['Event'].values:
f1 = 0
C[str(dfb.iloc[0,0])] = dfb
del dfb
dfb = pd.DataFrame(columns = df.columns)
if row['Event'] == 2:
dfb = dfb.append(row)
f1 =1
else:
f1=0
del dfb
dfb = pd.DataFrame(columns = df.columns)
Edit: The desired output is basically a dictionary of the subsets shown in the imagehttps://i.stack.imgur.com/ClWZs.png
If you want to accerlate, you should vectorize your code. You could try it like this (df is the same with your code):
vec = df.copy()
vec['Event_y'] = vec['Event'].shift(1).fillna(0).astype(int)
vec['Same_Flag'] = float('nan')
vec.Same_Flag.loc[(vec['Event_y'] == vec['Event']) & (vec['Event'] != 1)] = 1
vec.dropna(inplace=True)
vec.loc[:, ('Date', 'Event')]
Output is:
Date Event
3 104 2
4 105 2
6 107 3
10 111 2
18 119 2
20 121 3
21 122 3
I think that's close to what you need. You could improve based on that.
I'm not understand why date 104, 105, 107 are not counted.
Consider there are two columns A and B in a dataframe. How can I decile column A and use those breakpoints of column A deciles to calculate the count of rows in column B??
import pandas as pd
import numpy as np
df=pd.read_excel("E:\Sai\Development\UCG\qcut.xlsx")
df['Range']=pd.qcut(df['a'],10)
df_gb=df.groupby('Range',as_index=False).agg({'a':[min,max,np.size]})
df_gb.columns = df_gb.columns.droplevel()
df_gb=df_gb.rename(columns={'':'Range','size':'count_A'})
df['Range_B']=0
df['Range_B'].loc[df['b']<=df_gb['max'][0]]=1
df['Range_B'].loc[(df['b']>df_gb['max'][0]) & (df['b']<=df_gb['max'][1])]=2
df['Range_B'].loc[(df['b']>df_gb['max'][1]) & (df['b']<=df_gb['max'][2])]=3
df['Range_B'].loc[(df['b']>df_gb['max'][2]) & (df['b']<=df_gb['max'][3])]=4
df['Range_B'].loc[(df['b']>df_gb['max'][3]) & (df['b']<=df_gb['max'][4])]=5
df['Range_B'].loc[(df['b']>df_gb['max'][4]) & (df['b']<=df_gb['max'][5])]=6
df['Range_B'].loc[(df['b']>df_gb['max'][5]) & (df['b']<=df_gb['max'][6])]=7
df['Range_B'].loc[(df['b']>df_gb['max'][6]) & (df['b']<=df_gb['max'][7])]=8
df['Range_B'].loc[(df['b']>df_gb['max'][7]) & (df['b']<=df_gb['max'][8])]=9
df['Range_B'].loc[df['b']>df_gb['max'][8]]=10
df_gb_b=df.groupby('Range_B',as_index=False).agg({'b':np.size})
df_gb_b=df_gb_b.rename(columns={'b':'count_B'})
df_final = pd.concat([df_gb, df_gb_b], axis=1)
df_final=df_final[['Range','count_A','count_B']]
Is there any simple solution, as I intend to do for so many columns
I hope this would help:
df['Range'] = pd.qcut(df['a'], 10)
df2 = df.groupby(['Range'])['a'].count().reset_index().rename(columns = {'a':'count_A'})
for item in df2['Range'].values:
df2.loc[df2['Range'] == item, 'count_B'] = df['b'].apply(lambda x: x in item).sum()
df2 = df2.sort_values('Range', ascending = True)
if you want to additionally count values b that are out of range a:
min_border = df2['Range'].values[0].left
max_border = df2['Range'].values[-1].right
df2.loc[0, 'count_B'] += df.loc[df['b'] <= min_border, 'b'].count()
df2.iloc[-1, 2] += df.loc[df['b'] > max_border, 'b'].count()
One way -
df = pd.DataFrame({'A': np.random.randint(0, 100, 20), 'B': np.random.randint(0, 10, 20)})
bins = [0, 1, 4, 8, 16, 32, 60, 100, 200, 500, 5999]
labels = ["{0} - {1}".format(i, j) for i, j in zip(bins, bins[1:])]
df['group_A'] = pd.cut(df['A'], bins, right=False, labels=labels)
df['group_B'] = pd.cut(df.B, bins, right=False, labels=labels)
df1 = df.groupby(['group_A'])['A'].count().reset_index()
df2 = df.groupby(['group_B'])['B'].count().reset_index()
df_final = pd.merge(df1, df2, left_on =['group_A'], right_on =['group_B']).drop(['group_B'], axis=1).rename(columns={'group_A': 'group'})
print(df_final)
Output
group A B
0 0 - 1 0 1
1 1 - 4 1 3
2 4 - 8 1 9
3 8 - 16 2 7
4 16 - 32 3 0
5 32 - 60 7 0
6 60 - 100 6 0
7 100 - 200 0 0
8 200 - 500 0 0
9 500 - 5999 0 0
I would like to find a pattern in a dataframe in a categorical variable going down rows. I can see how to use Series.shift() to look up / down and using boolean logic to find the pattern, however, I want to do this with a grouping variable and also label all rows that are part of the pattern, not just the starting row.
Code:
import pandas as pd
from numpy.random import choice, randn
import string
# df constructor
n_rows = 1000
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
'group_var': choice(list(string.ascii_uppercase), n_rows),
'row_pat': choice([0, 1, 2, 3], n_rows),
'values': randn(n_rows)})
# sorting
df.sort_values(by=['group_var', 'date_time'], inplace=True)
df.head(10)
Which returns this:
I can find the start of the pattern (with no grouping though) by this:
# the row ordinal pattern to detect
p0, p1, p2, p3 = 1, 2, 2, 0
# flag the row at the start of the pattern
df['pat_flag'] = \
df['row_pat'].eq(p0) & \
df['row_pat'].shift(-1).eq(p1) & \
df['row_pat'].shift(-2).eq(p2) & \
df['row_pat'].shift(-3).eq(p3)
df.head(10)
What i cant figure out, is how to do this only withing the "group_var", and instead of returning True for the start of the pattern, return true for all rows that are part of the pattern.
Appreciate any tips on how to solve this!
Thanks...
I think you have 2 ways - simplier and slowier solution or faster complicated.
use Rolling.apply and test pattern
replace 0s to NaNs by mask
use bfill with limit (same as fillna with method='bfill') for repeat 1
then fillna NaNs to 0
last cast to bool by astype
pat = np.asarray([1, 2, 2, 0])
N = len(pat)
df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
.apply(lambda x: (x==pat).all())
.mask(lambda x: x == 0)
.bfill(limit=N-1)
.fillna(0)
.astype(bool)
)
If is important performance, use strides, solution from link was modify:
use rolling window approach
compare with pattaern and return Trues for match by all
get indices of first occurencies by np.mgrid and indexing
create all indices with list comprehension
compare by numpy.in1d and create new column
def rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
return c
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
c = np.mgrid[0:len(b)][b]
d = [i for x in c for i in range(x, x+N)]
df['rm2'] = np.in1d(np.arange(len(arr)), d)
Another solution, thanks #divakar:
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
m = (rolling_window(arr, len(pat)) == pat).all(1)
m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
Timings:
np.random.seed(456)
import pandas as pd
from numpy.random import choice, randn
from scipy.ndimage.morphology import binary_dilation
import string
# df constructor
n_rows = 100000
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
'group_var': choice(list(string.ascii_uppercase), n_rows),
'row_pat': choice([0, 1, 2, 3], n_rows),
'values': randn(n_rows)})
# sorting
df.sort_values(by=['group_var', 'date_time'], inplace=True)
def rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
return c
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
m = (rolling_window(arr, len(pat)) == pat).all(1)
m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
c = np.mgrid[0:len(b)][b]
d = [i for x in c for i in range(x, x+N)]
df['rm2'] = np.in1d(np.arange(len(arr)), d)
print (df.iloc[460:480])
date_time group_var row_pat values rm0 rm1 rm2
12045 2019-06-25 21:00:00 A 3 -0.081152 False False False
12094 2019-06-27 22:00:00 A 1 -0.818167 False False False
12125 2019-06-29 05:00:00 A 0 -0.051088 False False False
12143 2019-06-29 23:00:00 A 0 -0.937589 False False False
12145 2019-06-30 01:00:00 A 3 0.298460 False False False
12158 2019-06-30 14:00:00 A 1 0.647161 False False False
12164 2019-06-30 20:00:00 A 3 -0.735538 False False False
12210 2019-07-02 18:00:00 A 1 -0.881740 False False False
12341 2019-07-08 05:00:00 A 3 0.525652 False False False
12343 2019-07-08 07:00:00 A 1 0.311598 False False False
12358 2019-07-08 22:00:00 A 1 -0.710150 True True True
12360 2019-07-09 00:00:00 A 2 -0.752216 True True True
12400 2019-07-10 16:00:00 A 2 -0.205122 True True True
12404 2019-07-10 20:00:00 A 0 1.342591 True True True
12413 2019-07-11 05:00:00 A 1 1.707748 False False False
12506 2019-07-15 02:00:00 A 2 0.319227 False False False
12527 2019-07-15 23:00:00 A 3 2.130917 False False False
12600 2019-07-19 00:00:00 A 1 -1.314070 False False False
12604 2019-07-19 04:00:00 A 0 0.869059 False False False
12613 2019-07-19 13:00:00 A 2 1.342101 False False False
In [225]: %%timeit
...: df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
...: .apply(lambda x: (x==pat).all())
...: .mask(lambda x: x == 0)
...: .bfill(limit=N-1)
...: .fillna(0)
...: .astype(bool)
...: )
...:
1 loop, best of 3: 356 ms per loop
In [226]: %%timeit
...: arr = df['row_pat'].values
...: b = np.all(rolling_window(arr, N) == pat, axis=1)
...: c = np.mgrid[0:len(b)][b]
...: d = [i for x in c for i in range(x, x+N)]
...: df['rm2'] = np.in1d(np.arange(len(arr)), d)
...:
100 loops, best of 3: 7.63 ms per loop
In [227]: %%timeit
...: arr = df['row_pat'].values
...: b = np.all(rolling_window(arr, N) == pat, axis=1)
...:
...: m = (rolling_window(arr, len(pat)) == pat).all(1)
...: m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
...: df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
...:
100 loops, best of 3: 7.25 ms per loop
You could make use of the pd.rolling() methods and then simply compare the arrays that it returns with the array that contains the pattern that you are attempting to match on.
pattern = np.asarray([1.0, 2.0, 2.0, 0.0])
n_obs = len(pattern)
df['rolling_match'] = (df['row_pat']
.rolling(window=n_obs , min_periods=n_obs)
.apply(lambda x: (x==pattern).all())
.astype(bool) # All as bools
.shift(-1 * (n_obs - 1)) # Shift back
.fillna(False) # convert NaNs to False
)
It is important to specify the min periods here in order to ensure that you only find exact matches (and so the equality check won't fail when the shapes are misaligned). The apply function is doing a pairwise check between the two arrays, and then we use the .all() to ensure all match. We convert to a bool, and then call shift on the function to move it to being a 'forward looking' indicator instead of only occurring after the fact.
Help on the rolling functionality available here -
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rolling.html
This works.
It works like this:
a) For every group, it takes a window of size 4 and scans through the column until it finds the combination (1,2,2,0) in exact sequence. As soon as it finds the sequence, it populates the corresponding index values of new column 'pat_flag' with 1.
b) If it doesn't find the combination, it populates the column with 0.
pattern = [1,2,2,0]
def get_pattern(df):
df = df.reset_index(drop=True)
df['pat_flag'] = 0
get_indexes = []
temp = []
for index, row in df.iterrows():
mindex = index +1
# get the next 4 values
for j in range(mindex, mindex+4):
if j == df.shape[0]:
break
else:
get_indexes.append(j)
temp.append(df.loc[j,'row_pat'])
# check if sequence is matched
if temp == pattern:
df.loc[get_indexes,'pat_flag'] = 1
else:
# reset if the pattern is not found in given window
temp = []
get_indexes = []
return df
# apply function to the groups
df = df.groupby('group_var').apply(get_pattern)
## snippet of output
date_time group_var row_pat values pat_flag
41 2018-03-13 21:00:00 C 3 0.731114 0
42 2018-03-14 05:00:00 C 0 1.350164 0
43 2018-03-14 11:00:00 C 1 -0.429754 1
44 2018-03-14 12:00:00 C 2 1.238879 1
45 2018-03-15 17:00:00 C 2 -0.739192 1
46 2018-03-18 06:00:00 C 0 0.806509 1
47 2018-03-20 06:00:00 C 1 0.065105 0
48 2018-03-20 08:00:00 C 1 0.004336 0
Expanding on Emmet02's answer: using the rolling function for all groups and setting match-column to 1 for all matching pattern indices:
pattern = np.asarray([1,2,2,0])
# Create a match column in the main dataframe
df.assign(match=False, inplace=True)
for group_var, group in df.groupby("group_var"):
# Per group do rolling window matching, the last
# values of matching patterns in array 'match'
# will be True
match = (
group['row_pat']
.rolling(window=len(pattern), min_periods=len(pattern))
.apply(lambda x: (x==pattern).all())
)
# Get indices of matches in current group
idx = np.arange(len(group))[match == True]
# Include all indices of matching pattern,
# counting back from last index in pattern
idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))
# Update matches
match.values[idx] = True
df.loc[group.index, 'match'] = match
df[df.match==True]
edit: Without a for loop
# Do rolling matching in group clause
match = (
df.groupby("group_var")
.rolling(len(pattern))
.row_pat.apply(lambda x: (x==pattern).all())
)
# Convert NaNs
match = (~match.isnull() & match)
# Get indices of matches in current group
idx = np.arange(len(df))[match]
# Include all indices of matching pattern
idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))
# Mark all indices that are selected by "idx" in match-column
df = df.assign(match=df.index.isin(df.index[idx]))
You can do this by defining a custom aggregate function, then using it in group_by statement, finally merge it back to the original dataframe. Something like this:
Aggregate function:
def pattern_detect(column):
# define any other pattern to detect here
p0, p1, p2, p3 = 1, 2, 2, 0
column.eq(p0) & \
column.shift(-1).eq(p1) & \
column.shift(-2).eq(p2) & \
column.shift(-3).eq(p3)
return column.any()
Use group by function next:
grp = df.group_by('group_var').agg([patter_detect])['row_pat']
Now merge it back to the original dataframe:
df = df.merge(grp, left_on='group_var',right_index=True, how='left')
I have a pandas DataFrame and I would like to write a function that helps me to sum up every negative values as i.e. result1, and every positive values as result2. So basically, this function should iterate over the column "total_load"
def total_battery(ok6, col_name='total_load'):
"""Return a dictionary with counts of occurrences."""
# Initialize an empty dictionary: cols_values
cols_values = {}
# Extract column from df: col
col = ok6[col_name]
# iterate over the column in df
for entry in col:
if entry in cols_values.keys() < 0: ***--> then sum all the negative values***
cols_values[entry] += sum
else:
if entry in cols_values.keys() > 0: ***--> then sum all the negative values***
cols_values[entry] += sum
# Return the cols_count dictionary
return cols_values
# Call count_entries(): result1
result1 = total_battery(ok6, "total_load")
# Call count_entries(): result2
result2 = total_battery(ok6, "total_load")
# Print result1 and result2
print(result1)
print(result2)
Use boolean indexing or query for filtering and then Series.sum:
result1 = df.loc[df['total_load'] < 0, 'total_load'].sum()
result2 = df.loc[df['total_load'] > 0, 'total_load'].sum()
result1 = df.query('total_load < 0')['total_load'].sum()
result2 = df.query('total_load > 0')['total_load'].sum()
Sample:
rng = pd.date_range('2016-06-01', periods=4, freq='T')
df = pd.DataFrame({'total_load':[1,2,-3,-5]}, index=rng)
print (df)
total_load
2016-06-01 00:00:00 1
2016-06-01 00:01:00 2
2016-06-01 00:02:00 -3
2016-06-01 00:03:00 -5
result1 = df.loc[df['total_load'] < 0, 'total_load'].sum()
result2 = df.loc[df['total_load'] > 0, 'total_load'].sum()
print (result1)
-8
print (result2)
3
result1 = df.query('total_load < 0')['total_load'].sum()
result2 = df.query('total_load > 0')['total_load'].sum()
print (result1)
-8
print (result2)
3