Python3 to speed up the computing of dataframe

Python3 to speed up the computing of dataframe - pandas

I have a dataframe (df) as following
id date t_slot dayofweek label
1 2021-01-01 2 0 1
1 2021-01-02 3 1 0
2 2021-01-01 4 6 1
.......
The data frame is very large(6 million rows). the t_slot is from 1 to 6 value. dayofweek is from 0-6.
I want to get the rate:
- the each id's rate about the label is 1 rate when the t_slot is 1 to 4, and dayofweek is 0-4 in the past 3 months before the date in each row.
- the each id's rate about the label is 1 rate when the t_slot is 1 to 4, and dayofweek is 0-4 in the past 3 months before the date in each row.
- the each id's rate about the label is 1 rate when the t_slot is 5 to 6, and dayofweek is 5-6 in the past 3 months before the date in each row.
- the each id's rate about the label is 1 rate when the t_slot is 5 to 6, and dayofweek is 5-6 in the past 3 months before the date in each row.
I have used loop to compute the rate, but it is very slow, do you have fast way to compute it. My code is copied as following:
def get_time_slot_rate(df):
import numpy as np
if len(df)==0:
return np.nan, np.nan, np.nan, np.nan
else:
work = df.loc[df['dayofweek']<5]
weekend = df.loc[df['dayofweek']>=5]
if len(work)==0:
work_14, work_56 = np.nan, np.nan
else:
work_14 = len(work.loc[(work['time_slot']<5)*(work['label']==1)])/len(work)
work_56 = len(work.loc[(work['time_slot']>5)*(work['label']==1)])/len(work)
if len(weekend)==0:
weekend_14, weekend_56 = np.nan, np.nan
else:
weekend_14 = len(weekend.loc[(weekend['time_slot']<5)*(weekend['label']==1)])/len(weekend)
weekend_56 = len(weekend.loc[(weekend['time_slot']>5)*(weekend['label']==1)])/len(weekend)
return work_14, work_56, weekend_14, weekend_56
import datetime as d_t
lst_id = list(df['id'])
lst_date = list(df['date'])
lst_t14_work = []
lst_t56_work = []
lst_t14_weekend = []
lst_t56_weekend = []
for i in range(len(lst_id)):
if i%100==0:
print(i)
d_date = lst_date[i]
dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
month_step = relativedelta(months=3)
pre_date = str(dt - month_step).split(' ')[0]
df_s = df.loc[(df['easy_id']==lst_easy[i])
& ((df['delivery_date']>=pre_date)
&(df['delivery_date']< d_date))].reset_index(drop=True)
work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df_s)
lst_t14_work.append(work_14_rate)
lst_t56_work.append(work_56_rate)
lst_t14_weekend.append(weekend_14_rate)
lst_t56_weekend.append(weekend_56_rate)

I could only fix your function and it's completely untested, but here we go:
Import only once by putting the imports at the top of your .py.
try/except blocks are more efficient than if/else statements.
True and False equals to 1 and 0 respectively in Python.
Don't multiply boolean selectors and use the reverse operator ~
Create the least amount of copies.
import numpy as np
def get_time_slot_rate(df):
# much faster than counting
if df.empty:
return np.nan, np.nan, np.nan, np.nan
# assuming df['label'] is either 0 or 1
df = df.loc[df['label']]
# create boolean selectors to be inverted with '~'
weekdays = df['dayofweek']<=5
slot_selector = df['time_slot']<=5
weekday_count = np.sum(weekdays)
try:
work_14 = len(df.loc[weekdays & slot_selector])/weekday_count
work_56 = len(df.loc[weekdays & ~slot_selector])/weekday_count
except ZeroDivisionError:
work_14 = work_56 = np.nan
weekend_count = np.sum(~weekdays)
try:
weekend_14 = len(df.loc[~weekdays & slot_selector])/weekend_count
weekend_56 = len(df.loc[~weekdays & ~slot_selector])/weekend_count
except ZeroDivisionError:
weekend_14 = weekend_56 = np.nan
return work_14, work_56, weekend_14, weekend_56
The rest of your script doesn't really make sense, see my comments:
for i in range(len(lst_id)):
if i%100==0:
print(i)
d_date = date[i]
# what is d_t ?
dt = d_t.datetime.strptime(d_date, '%Y-%m-%d')
month_step = relativedelta(months=3)
pre_date = str(dt - month_step).split(' ')[0]
df_s = df.loc[(df['easy_id']==lst_easy[i])
& (df['delivery_date']>=pre_date)
&(df['delivery_date']< d_date)].reset_index(drop=True)
# is it df or df_s ?
work_14_rate, work_56_rate, weekend_14_rate, weekend_56_rate = get_time_slot_rate(df)
If your date column is a datetime object than you can compare dates directly (no need for strings).

Related

set all even saturdays of every month to zero

I have a dataframe with date column and a holiday column
I have been trying to set even saturdays to 0. For example if a month has 5 saturdays - 2nd and 4th saturday must be set to 0 and other saturdays must be 1.
Code I tried:
import pandas as pd
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df['holiday'] = 1
def is_weekend(d):
if d.weekday() == 6:
return 0
elif d.weekday() == 5 and (d.day-1)//7%2 == 1:
return 0
else:
return 1
df['holiday'] = df.apply(lambda row: is_weekend(row['date']) if row['holiday'] == 1 else 0, axis=1)
so here, all sundays are set to 0 since its a holiday and i tried setting every even saturday to 0. But For example, 30.07.2022 must be 1 (5th saturday) but its being set to 0 with my code.
How can I get only even saturdays as 0?

you can use these codes
start_date = '2022-01-01'
end_date = '2022-12-31'
dates = pd.date_range(start_date, end_date)
df = pd.DataFrame({'date': dates})
df['date'] = pd.to_datetime(df['date'])
df = df.reset_index()
df["day"] = df["date"].dt.day_name()
df["holiday"] = df.apply(lambda x: 0 if any([x["day"]=="Sunday",(x["index"]%2==1 and x["day"]=="Saturday")]) else 1, axis=1)
df = df.drop(["index","day"], axis=1)
df

Subset two consecutive event occurrence in pandas

I'm trying to get a subset of my data whenever there is consecutive occurrence of an two events in that order. The event is time-stamped. So every time there are continuous 2's and then continuous 3's, I want to subset that to a dataframe and append it to a dictionary. The following code does that but I have to apply this to a very large dataframe of more than 20 mil obs. This is extremely slow using iterrows. How can I make this fast?
df = pd.DataFrame({'Date': [101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122],
'Event': [1,1,2,2,2,3,3,1,3,2,2,3,1,2,3,2,3,2,2,3,3,3]})
dfb = pd.DataFrame(columns = df.columns)
C = {}
f1 = 0
for index, row in df.iterrows():
if ((row['Event'] == 2) & (3 not in dfb['Event'].values)):
dfb = dfb.append(row)
f1 =1
elif ((row['Event'] == 3) & (f1 == 1)):
dfb = dfb.append(row)
elif 3 in dfb['Event'].values:
f1 = 0
C[str(dfb.iloc[0,0])] = dfb
del dfb
dfb = pd.DataFrame(columns = df.columns)
if row['Event'] == 2:
dfb = dfb.append(row)
f1 =1
else:
f1=0
del dfb
dfb = pd.DataFrame(columns = df.columns)
Edit: The desired output is basically a dictionary of the subsets shown in the imagehttps://i.stack.imgur.com/ClWZs.png

If you want to accerlate, you should vectorize your code. You could try it like this (df is the same with your code):
vec = df.copy()
vec['Event_y'] = vec['Event'].shift(1).fillna(0).astype(int)
vec['Same_Flag'] = float('nan')
vec.Same_Flag.loc[(vec['Event_y'] == vec['Event']) & (vec['Event'] != 1)] = 1
vec.dropna(inplace=True)
vec.loc[:, ('Date', 'Event')]
Output is:
Date Event
3 104 2
4 105 2
6 107 3
10 111 2
18 119 2
20 121 3
21 122 3
I think that's close to what you need. You could improve based on that.
I'm not understand why date 104, 105, 107 are not counted.

Getting count of rows from breakpoints of different column

Consider there are two columns A and B in a dataframe. How can I decile column A and use those breakpoints of column A deciles to calculate the count of rows in column B??
import pandas as pd
import numpy as np
df=pd.read_excel("E:\Sai\Development\UCG\qcut.xlsx")
df['Range']=pd.qcut(df['a'],10)
df_gb=df.groupby('Range',as_index=False).agg({'a':[min,max,np.size]})
df_gb.columns = df_gb.columns.droplevel()
df_gb=df_gb.rename(columns={'':'Range','size':'count_A'})
df['Range_B']=0
df['Range_B'].loc[df['b']<=df_gb['max'][0]]=1
df['Range_B'].loc[(df['b']>df_gb['max'][0]) & (df['b']<=df_gb['max'][1])]=2
df['Range_B'].loc[(df['b']>df_gb['max'][1]) & (df['b']<=df_gb['max'][2])]=3
df['Range_B'].loc[(df['b']>df_gb['max'][2]) & (df['b']<=df_gb['max'][3])]=4
df['Range_B'].loc[(df['b']>df_gb['max'][3]) & (df['b']<=df_gb['max'][4])]=5
df['Range_B'].loc[(df['b']>df_gb['max'][4]) & (df['b']<=df_gb['max'][5])]=6
df['Range_B'].loc[(df['b']>df_gb['max'][5]) & (df['b']<=df_gb['max'][6])]=7
df['Range_B'].loc[(df['b']>df_gb['max'][6]) & (df['b']<=df_gb['max'][7])]=8
df['Range_B'].loc[(df['b']>df_gb['max'][7]) & (df['b']<=df_gb['max'][8])]=9
df['Range_B'].loc[df['b']>df_gb['max'][8]]=10
df_gb_b=df.groupby('Range_B',as_index=False).agg({'b':np.size})
df_gb_b=df_gb_b.rename(columns={'b':'count_B'})
df_final = pd.concat([df_gb, df_gb_b], axis=1)
df_final=df_final[['Range','count_A','count_B']]
Is there any simple solution, as I intend to do for so many columns

I hope this would help:
df['Range'] = pd.qcut(df['a'], 10)
df2 = df.groupby(['Range'])['a'].count().reset_index().rename(columns = {'a':'count_A'})
for item in df2['Range'].values:
df2.loc[df2['Range'] == item, 'count_B'] = df['b'].apply(lambda x: x in item).sum()
df2 = df2.sort_values('Range', ascending = True)
if you want to additionally count values b that are out of range a:
min_border = df2['Range'].values[0].left
max_border = df2['Range'].values[-1].right
df2.loc[0, 'count_B'] += df.loc[df['b'] <= min_border, 'b'].count()
df2.iloc[-1, 2] += df.loc[df['b'] > max_border, 'b'].count()

One way -
df = pd.DataFrame({'A': np.random.randint(0, 100, 20), 'B': np.random.randint(0, 10, 20)})
bins = [0, 1, 4, 8, 16, 32, 60, 100, 200, 500, 5999]
labels = ["{0} - {1}".format(i, j) for i, j in zip(bins, bins[1:])]
df['group_A'] = pd.cut(df['A'], bins, right=False, labels=labels)
df['group_B'] = pd.cut(df.B, bins, right=False, labels=labels)
df1 = df.groupby(['group_A'])['A'].count().reset_index()
df2 = df.groupby(['group_B'])['B'].count().reset_index()
df_final = pd.merge(df1, df2, left_on =['group_A'], right_on =['group_B']).drop(['group_B'], axis=1).rename(columns={'group_A': 'group'})
print(df_final)
Output
group A B
0 0 - 1 0 1
1 1 - 4 1 3
2 4 - 8 1 9
3 8 - 16 2 7
4 16 - 32 3 0
5 32 - 60 7 0
6 60 - 100 6 0
7 100 - 200 0 0
8 200 - 500 0 0
9 500 - 5999 0 0

Pandas - Find and index rows that match row sequence pattern

I would like to find a pattern in a dataframe in a categorical variable going down rows. I can see how to use Series.shift() to look up / down and using boolean logic to find the pattern, however, I want to do this with a grouping variable and also label all rows that are part of the pattern, not just the starting row.
Code:
import pandas as pd
from numpy.random import choice, randn
import string
# df constructor
n_rows = 1000
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
'group_var': choice(list(string.ascii_uppercase), n_rows),
'row_pat': choice([0, 1, 2, 3], n_rows),
'values': randn(n_rows)})
# sorting
df.sort_values(by=['group_var', 'date_time'], inplace=True)
df.head(10)
Which returns this:
I can find the start of the pattern (with no grouping though) by this:
# the row ordinal pattern to detect
p0, p1, p2, p3 = 1, 2, 2, 0
# flag the row at the start of the pattern
df['pat_flag'] = \
df['row_pat'].eq(p0) & \
df['row_pat'].shift(-1).eq(p1) & \
df['row_pat'].shift(-2).eq(p2) & \
df['row_pat'].shift(-3).eq(p3)
df.head(10)
What i cant figure out, is how to do this only withing the "group_var", and instead of returning True for the start of the pattern, return true for all rows that are part of the pattern.
Appreciate any tips on how to solve this!
Thanks...

I think you have 2 ways - simplier and slowier solution or faster complicated.
use Rolling.apply and test pattern
replace 0s to NaNs by mask
use bfill with limit (same as fillna with method='bfill') for repeat 1
then fillna NaNs to 0
last cast to bool by astype
pat = np.asarray([1, 2, 2, 0])
N = len(pat)
df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
.apply(lambda x: (x==pat).all())
.mask(lambda x: x == 0)
.bfill(limit=N-1)
.fillna(0)
.astype(bool)
)
If is important performance, use strides, solution from link was modify:
use rolling window approach
compare with pattaern and return Trues for match by all
get indices of first occurencies by np.mgrid and indexing
create all indices with list comprehension
compare by numpy.in1d and create new column
def rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
return c
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
c = np.mgrid[0:len(b)][b]
d = [i for x in c for i in range(x, x+N)]
df['rm2'] = np.in1d(np.arange(len(arr)), d)
Another solution, thanks #divakar:
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
m = (rolling_window(arr, len(pat)) == pat).all(1)
m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
Timings:
np.random.seed(456)
import pandas as pd
from numpy.random import choice, randn
from scipy.ndimage.morphology import binary_dilation
import string
# df constructor
n_rows = 100000
df = pd.DataFrame({'date_time': pd.date_range('2/9/2018', periods=n_rows, freq='H'),
'group_var': choice(list(string.ascii_uppercase), n_rows),
'row_pat': choice([0, 1, 2, 3], n_rows),
'values': randn(n_rows)})
# sorting
df.sort_values(by=['group_var', 'date_time'], inplace=True)
def rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
c = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
return c
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
m = (rolling_window(arr, len(pat)) == pat).all(1)
m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
arr = df['row_pat'].values
b = np.all(rolling_window(arr, N) == pat, axis=1)
c = np.mgrid[0:len(b)][b]
d = [i for x in c for i in range(x, x+N)]
df['rm2'] = np.in1d(np.arange(len(arr)), d)
print (df.iloc[460:480])
date_time group_var row_pat values rm0 rm1 rm2
12045 2019-06-25 21:00:00 A 3 -0.081152 False False False
12094 2019-06-27 22:00:00 A 1 -0.818167 False False False
12125 2019-06-29 05:00:00 A 0 -0.051088 False False False
12143 2019-06-29 23:00:00 A 0 -0.937589 False False False
12145 2019-06-30 01:00:00 A 3 0.298460 False False False
12158 2019-06-30 14:00:00 A 1 0.647161 False False False
12164 2019-06-30 20:00:00 A 3 -0.735538 False False False
12210 2019-07-02 18:00:00 A 1 -0.881740 False False False
12341 2019-07-08 05:00:00 A 3 0.525652 False False False
12343 2019-07-08 07:00:00 A 1 0.311598 False False False
12358 2019-07-08 22:00:00 A 1 -0.710150 True True True
12360 2019-07-09 00:00:00 A 2 -0.752216 True True True
12400 2019-07-10 16:00:00 A 2 -0.205122 True True True
12404 2019-07-10 20:00:00 A 0 1.342591 True True True
12413 2019-07-11 05:00:00 A 1 1.707748 False False False
12506 2019-07-15 02:00:00 A 2 0.319227 False False False
12527 2019-07-15 23:00:00 A 3 2.130917 False False False
12600 2019-07-19 00:00:00 A 1 -1.314070 False False False
12604 2019-07-19 04:00:00 A 0 0.869059 False False False
12613 2019-07-19 13:00:00 A 2 1.342101 False False False
In [225]: %%timeit
...: df['rm0'] = (df['row_pat'].rolling(window=N , min_periods=N)
...: .apply(lambda x: (x==pat).all())
...: .mask(lambda x: x == 0)
...: .bfill(limit=N-1)
...: .fillna(0)
...: .astype(bool)
...: )
...:
1 loop, best of 3: 356 ms per loop
In [226]: %%timeit
...: arr = df['row_pat'].values
...: b = np.all(rolling_window(arr, N) == pat, axis=1)
...: c = np.mgrid[0:len(b)][b]
...: d = [i for x in c for i in range(x, x+N)]
...: df['rm2'] = np.in1d(np.arange(len(arr)), d)
...:
100 loops, best of 3: 7.63 ms per loop
In [227]: %%timeit
...: arr = df['row_pat'].values
...: b = np.all(rolling_window(arr, N) == pat, axis=1)
...:
...: m = (rolling_window(arr, len(pat)) == pat).all(1)
...: m_ext = np.r_[m,np.zeros(len(arr) - len(m), dtype=bool)]
...: df['rm1'] = binary_dilation(m_ext, structure=[1]*N, origin=-(N//2))
...:
100 loops, best of 3: 7.25 ms per loop

You could make use of the pd.rolling() methods and then simply compare the arrays that it returns with the array that contains the pattern that you are attempting to match on.
pattern = np.asarray([1.0, 2.0, 2.0, 0.0])
n_obs = len(pattern)
df['rolling_match'] = (df['row_pat']
.rolling(window=n_obs , min_periods=n_obs)
.apply(lambda x: (x==pattern).all())
.astype(bool) # All as bools
.shift(-1 * (n_obs - 1)) # Shift back
.fillna(False) # convert NaNs to False
)
It is important to specify the min periods here in order to ensure that you only find exact matches (and so the equality check won't fail when the shapes are misaligned). The apply function is doing a pairwise check between the two arrays, and then we use the .all() to ensure all match. We convert to a bool, and then call shift on the function to move it to being a 'forward looking' indicator instead of only occurring after the fact.
Help on the rolling functionality available here -
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rolling.html

This works.
It works like this:
a) For every group, it takes a window of size 4 and scans through the column until it finds the combination (1,2,2,0) in exact sequence. As soon as it finds the sequence, it populates the corresponding index values of new column 'pat_flag' with 1.
b) If it doesn't find the combination, it populates the column with 0.
pattern = [1,2,2,0]
def get_pattern(df):
df = df.reset_index(drop=True)
df['pat_flag'] = 0
get_indexes = []
temp = []
for index, row in df.iterrows():
mindex = index +1
# get the next 4 values
for j in range(mindex, mindex+4):
if j == df.shape[0]:
break
else:
get_indexes.append(j)
temp.append(df.loc[j,'row_pat'])
# check if sequence is matched
if temp == pattern:
df.loc[get_indexes,'pat_flag'] = 1
else:
# reset if the pattern is not found in given window
temp = []
get_indexes = []
return df
# apply function to the groups
df = df.groupby('group_var').apply(get_pattern)
## snippet of output
date_time group_var row_pat values pat_flag
41 2018-03-13 21:00:00 C 3 0.731114 0
42 2018-03-14 05:00:00 C 0 1.350164 0
43 2018-03-14 11:00:00 C 1 -0.429754 1
44 2018-03-14 12:00:00 C 2 1.238879 1
45 2018-03-15 17:00:00 C 2 -0.739192 1
46 2018-03-18 06:00:00 C 0 0.806509 1
47 2018-03-20 06:00:00 C 1 0.065105 0
48 2018-03-20 08:00:00 C 1 0.004336 0

Expanding on Emmet02's answer: using the rolling function for all groups and setting match-column to 1 for all matching pattern indices:
pattern = np.asarray([1,2,2,0])
# Create a match column in the main dataframe
df.assign(match=False, inplace=True)
for group_var, group in df.groupby("group_var"):
# Per group do rolling window matching, the last
# values of matching patterns in array 'match'
# will be True
match = (
group['row_pat']
.rolling(window=len(pattern), min_periods=len(pattern))
.apply(lambda x: (x==pattern).all())
)
# Get indices of matches in current group
idx = np.arange(len(group))[match == True]
# Include all indices of matching pattern,
# counting back from last index in pattern
idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))
# Update matches
match.values[idx] = True
df.loc[group.index, 'match'] = match
df[df.match==True]
edit: Without a for loop
# Do rolling matching in group clause
match = (
df.groupby("group_var")
.rolling(len(pattern))
.row_pat.apply(lambda x: (x==pattern).all())
)
# Convert NaNs
match = (~match.isnull() & match)
# Get indices of matches in current group
idx = np.arange(len(df))[match]
# Include all indices of matching pattern
idx = idx.repeat(len(pattern)) - np.tile(np.arange(len(pattern)), len(idx))
# Mark all indices that are selected by "idx" in match-column
df = df.assign(match=df.index.isin(df.index[idx]))

You can do this by defining a custom aggregate function, then using it in group_by statement, finally merge it back to the original dataframe. Something like this:
Aggregate function:
def pattern_detect(column):
# define any other pattern to detect here
p0, p1, p2, p3 = 1, 2, 2, 0
column.eq(p0) & \
column.shift(-1).eq(p1) & \
column.shift(-2).eq(p2) & \
column.shift(-3).eq(p3)
return column.any()
Use group by function next:
grp = df.group_by('group_var').agg([patter_detect])['row_pat']
Now merge it back to the original dataframe:
df = df.merge(grp, left_on='group_var',right_index=True, how='left')

Writing a function that is summing up certain values in a row in a pandas dataframe

I have a pandas DataFrame and I would like to write a function that helps me to sum up every negative values as i.e. result1, and every positive values as result2. So basically, this function should iterate over the column "total_load"
def total_battery(ok6, col_name='total_load'):
"""Return a dictionary with counts of occurrences."""
# Initialize an empty dictionary: cols_values
cols_values = {}
# Extract column from df: col
col = ok6[col_name]
# iterate over the column in df
for entry in col:
if entry in cols_values.keys() < 0: ***--> then sum all the negative values***
cols_values[entry] += sum
else:
if entry in cols_values.keys() > 0: ***--> then sum all the negative values***
cols_values[entry] += sum
# Return the cols_count dictionary
return cols_values
# Call count_entries(): result1
result1 = total_battery(ok6, "total_load")
# Call count_entries(): result2
result2 = total_battery(ok6, "total_load")
# Print result1 and result2
print(result1)
print(result2)

Use boolean indexing or query for filtering and then Series.sum:
result1 = df.loc[df['total_load'] < 0, 'total_load'].sum()
result2 = df.loc[df['total_load'] > 0, 'total_load'].sum()
result1 = df.query('total_load < 0')['total_load'].sum()
result2 = df.query('total_load > 0')['total_load'].sum()
Sample:
rng = pd.date_range('2016-06-01', periods=4, freq='T')
df = pd.DataFrame({'total_load':[1,2,-3,-5]}, index=rng)
print (df)
total_load
2016-06-01 00:00:00 1
2016-06-01 00:01:00 2
2016-06-01 00:02:00 -3
2016-06-01 00:03:00 -5
result1 = df.loc[df['total_load'] < 0, 'total_load'].sum()
result2 = df.loc[df['total_load'] > 0, 'total_load'].sum()
print (result1)
-8
print (result2)
3
result1 = df.query('total_load < 0')['total_load'].sum()
result2 = df.query('total_load > 0')['total_load'].sum()
print (result1)
-8
print (result2)
3

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Python3 to speed up the computing of dataframe - pandas

Related

set all even saturdays of every month to zero

Subset two consecutive event occurrence in pandas

Getting count of rows from breakpoints of different column

Pandas - Find and index rows that match row sequence pattern

Writing a function that is summing up certain values in a row in a pandas dataframe

Categories

Resources