Pandas groupby according to a selected group of days of the week

I have this dataframe:
import numpy as np
import pandas as pd

rng = pd.date_range(start='2018-01-01', end='2018-01-21')
rnd_values = np.random.rand(len(rng)) + 3
df = pd.DataFrame({'time': rng.to_list(), 'value': rnd_values})
Let's say that I want to group it according to the day of the week and compute the mean:
df['span'] = np.where(df['time'].dt.day_of_week <= 2, 'Mn-Wd', 'Th-Sn')
df['wkno'] = df['time'].dt.isocalendar().week.shift(fill_value=0)
df.groupby(['wkno','span']).mean()
However, I would like to make this procedure more general.
Let's say that I define the following days of the week:
days=['Monday','Thursday']
Is there any option that allows me to do what I have done above by using "days"? I imagine that I have to compute the number of days between 'Monday' and 'Thursday', and then I should use that number. What about the case when
days=['Monday','Thursday','Friday']
I was thinking of setting up a dictionary as:
days={'Monday':0,'Thursday':3,'Friday':4}
then
idays = list(days.values())
How can I now use idays inside np.where? Indeed, I have three intervals.
Thanks

If you want to use more than one threshold you need np.searchsorted. The resulting function would look something like:
def groupby_daysspan_week(dfc, days):
    df = dfc.copy()
    day_to_dayofweek = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2,
                        'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    short_dict = {0: 'Mn', 1: 'Tu', 2: 'Wd', 3: 'Th', 4: 'Fr', 5: 'St', 6: 'Sn'}
    day_split = [day_to_dayofweek[d] for d in days]
    df['wkno'] = df['time'].dt.isocalendar().week
    df['dow'] = df['time'].dt.day_of_week
    df['span'] = np.searchsorted(day_split, df['dow'], side='right')
    span_name_dict = {i + 1: short_dict[day_split[i]] + '-' + short_dict[(day_split + [6])[i + 1]]
                      for i in range(len(day_split))}
    df_agg = df.groupby(['wkno', 'span'])['value'].mean()
    df_agg = df_agg.rename(index=span_name_dict, level=1)
    return df_agg
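For example, with the sample dataframe from the question (the exact values depend on the random data):
# two split days -> two spans per week
print(groupby_daysspan_week(df, ['Monday', 'Thursday']))
# three split days -> three spans per week, as asked in the question
print(groupby_daysspan_week(df, ['Monday', 'Thursday', 'Friday']))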

Related

Capping multiple columns

I found an interesting snippet (by vrana95) that caps multiple columns; however, this function modifies the main "df" as well, instead of working only on "final_df". Does someone know why?
def cap_data(df):
    for col in df.columns:
        print("capping the ", col)
        if ((df[col].dtype == 'float64') | (df[col].dtype == 'int64')):
            percentiles = df[col].quantile([0.01, 0.99]).values
            df[col][df[col] <= percentiles[0]] = percentiles[0]
            df[col][df[col] >= percentiles[1]] = percentiles[1]
        else:
            df[col] = df[col]
    return df

final_df = cap_data(df)
As I wanted to cap only a few columns, I changed the for loop of the original snippet. It works, but I would like to know why this function modifies both dataframes.
cols = ['score_3', 'score_6', 'credit_limit', 'last_amount_borrowed', 'reported_income', 'income']

def cap_data(df):
    for col in cols:
        print("capping the column:", col)
        if ((df[col].dtype == 'float64') | (df[col].dtype == 'int64')):
            percentiles = df[col].quantile([0.01, 0.99]).values
            df[col][df[col] <= percentiles[0]] = percentiles[0]
            df[col][df[col] >= percentiles[1]] = percentiles[1]
        else:
            df[col] = df[col]
    return df

final_df = cap_data(df)
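(The behaviour comes from the fact that the function mutates the dataframe it receives in place: Python passes the object reference, so final_df = cap_data(df) returns the very same object as df. A minimal sketch of a copy-first variant, using Series.clip, that leaves the caller's dataframe untouched:)
def cap_data_copy(df):
    df = df.copy()  # work on a copy so the caller's dataframe is not modified
    for col in cols:
        if df[col].dtype in ('float64', 'int64'):
            lo, hi = df[col].quantile([0.01, 0.99])
            df[col] = df[col].clip(lower=lo, upper=hi)  # cap both tails at the 1st/99th percentiles
    return df

final_df = cap_data_copy(df)  # df is left unchanged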

Increment a time and add it in data frame column

Hi, I am new to Python and I am looking for the result below.
I have From_Curr (3 values) and To_Curr (3 values); I am making currency pairs and adding a new 'time' column to my data frame.
3*3 = 9 currency pairs are created, so I want the same time for all nine pairs, and then the time incremented by 1 hour for the same pairs again, as shown below.
The problem is that the time gets incremented after every row.
Actual df:
Expected df:
Thanks for any help and appreciate your time.
import pandas as pd
import datetime
from datetime import timedelta

data = pd.DataFrame({'From': ["EUR", "GBP", 'USD'],
                     'To': ["INR", "SGD", 'HKD'],
                     'time': ''})
init_date = datetime.datetime(1, 1, 1)
for index, row in data.iterrows():
    row['time'] = str(init_date)[11:19]
    init_date = init_date + timedelta(hours=1.0)
I don't understand why you are repeating the combinations and incrementing by one hour in the last half, but for this case you can do something like this:
import pandas as pd

data = pd.DataFrame({'From': ["EUR", "GBP", 'USD'],
                     'To': ["INR", "SGD", 'HKD'],
                     'time': ''})
outlist = [(i, j) for i in data["From"] for j in data["To"]] * 2  # create double combinations
data = pd.DataFrame(data=outlist, columns=["From", "To"])
data["time"] = "00:00:00"
data["time"].iloc[len(data) // 2:] = "01:00:00"  # assign 1 hour to the last half
data["time"] = pd.to_datetime(data["time"]).dt.time
Update: After some clarifications
import pandas as pd

data = pd.DataFrame(
    {"From": ["EUR", "GBP", "USD"], "To": ["INR", "SGD", "HKD"], "time": ""}
)
outlist = [
    (i, j) for i in data["From"] for j in data["To"]
] * 2  # create double combinations; I think that for your case it would be 24 instead of two
data = pd.DataFrame(data=outlist, columns=["From", "To"])
data["time"] = data.groupby(["From", "To"]).cumcount()  # get the occurrence count of each pair
data["time"] = data["time"] * pd.to_datetime("01:00:00").value  # multiply occurrences by the value of 1 hour in nanoseconds
data["time"] = pd.to_datetime(data["time"]).dt.time  # convert to a time of day
I think this script covers all your needs, happy coding :)
Regards,

Efficient way to expand a DataFrame in Julia

I have a dataframe with exposure episodes per case:
using DataFrames
using Dates
df = DataFrame(id = [1,1,2,3], startdate = [Date(2018,3,1),Date(2019,4,2),Date(2018,6,4),Date(2018,5,1)], enddate = [Date(2019,4,4),Date(2019,8,5),Date(2019,3,1),Date(2019,4,15)])
I want to expand each episode to its constituent days, eliminating any duplicate days per case resulting from overlapping episodes (case 1 in the example dataframe):
s = similar(df, 0)
for row in eachrow(df)
tf = DataFrame(row)
ttf = repeat(tf, Dates.value.(row.enddate - row.startdate) + 1)
ttf.daydate = ttf.startdate .+ Dates.Day.(0:nrow(ttf) - 1) #a record for each day between start and end days (inclusive)
ttf.start = ttf.daydate .== ttf.startdate #a flag to indicate this record was at the start of an episode
ttf.end = ttf.daydate .== ttf.enddate #a flag to indicate this record was at the end of an episode
append!(s, ttf, cols=:union)
end
sort!(s, [:id,:daydate,:startdate, order(:enddate, rev=true)])
unique!(s,[:id,:daydate]) #to eliminate duplicate dates in the case of episode overlaps (e.g. case 1)
I have a strong suspicion that there is a more efficient way of doing this than the brute force method I came up with and any help will be appreciated.
Implementation note: In the actual implementation there are several hundred thousand cases, each with relatively few episodes (median = 1, 75th percentile = 3, maximum = 20), but spanning 20 years or more of exposure, resulting in a very large dataset (several hundred million records). To fit into available memory I have partitioned the dataset on id and used the Threads.@threads macro to loop through the partitions in parallel. The primary purpose of this decomposition into days is not just to eliminate overlaps, but to join the data with other exposure data that is available on a per-day basis.
Below is a more complete solution that takes into account some essential details. Each episode is associated with additional attributes, as an example I used locationid (place where the exposure took place) and the need to indicate whether there was a gap between subsequent episodes. The original solution also did not cater for the special case where an episode is fully contained within another episode - such episodes should not be expanded.
using Dates
using DataFrames
function process(startdate, enddate, locationid)
    start = startdate[1]
    stop = enddate[1]
    location = locationid[1]
    res_daydate = collect(start:Day(1):stop)
    res_startdate = fill(start, length(res_daydate))
    res_enddate = fill(stop, length(res_daydate))
    res_location = fill(location, length(res_daydate))
    gap = 0
    res_gap = fill(0, length(res_daydate))
    for i in 2:length(startdate)
        if startdate[i] > res_daydate[end]
            start = startdate[i]
        elseif enddate[i] > res_daydate[end]
            start = res_daydate[end] + Day(1)
        else
            continue # this episode is contained within the previous episode
        end
        if start - res_daydate[end] > Day(1)
            gap = gap == 0 ? 1 : 0
        end
        stop = enddate[i]
        location = locationid[i]
        new_daydate = start:Day(1):stop
        append!(res_daydate, new_daydate)
        append!(res_startdate, fill(startdate[i], length(new_daydate)))
        append!(res_enddate, fill(stop, length(new_daydate)))
        append!(res_location, fill(location, length(new_daydate)))
        append!(res_gap, fill(gap, length(new_daydate)))
    end
    return (daydate=res_daydate, startdate=res_startdate, enddate=res_enddate, locationid=res_location, gap=res_gap)
end
function eliminateoverlap()
    df = DataFrame(id = [1,1,2,3,3,4,4],
                   startdate = [Date(2018,3,1),Date(2019,4,2),Date(2018,6,4),Date(2018,5,1),Date(2019,5,1),Date(2012,1,1),Date(2012,2,2)],
                   enddate = [Date(2019,4,4),Date(2019,8,5),Date(2019,3,1),Date(2019,4,15),Date(2019,6,15),Date(2012,6,30),Date(2012,2,10)],
                   locationid = [10,11,21,30,30,40,41])
    dfs = sort(df, [:startdate, order(:enddate, rev=true)])
    gdf = groupby(dfs, :id, sort=true)
    r = combine(gdf, [:startdate, :enddate, :locationid] => process => AsTable)
    df = combine(groupby(r, [:id,:gap,:locationid]), :daydate => minimum => :StartDate, :daydate => maximum => :EndDate)
    return df
end
df = eliminateoverlap()
Here is something that should be efficient:
dfs = sort(df, [:startdate, order(:enddate, rev=true)])
gdf = groupby(dfs, :id, sort=true)
function process(startdate, enddate)
    start = startdate[1]
    stop = enddate[1]
    res_daydate = collect(start:Day(1):stop)
    res_startdate = fill(start, length(res_daydate))
    res_enddate = fill(stop, length(res_daydate))
    for i in 2:length(startdate)
        if startdate[i] > res_daydate[end]
            start = startdate[i]
            stop = enddate[i]
        elseif enddate[i] > res_daydate[end]
            start = res_daydate[end] + Day(1)
            stop = enddate[i]
        end
        new_daydate = start:Day(1):stop
        append!(res_daydate, new_daydate)
        append!(res_startdate, fill(startdate[i], length(new_daydate)))
        append!(res_enddate, fill(stop, length(new_daydate)))
    end
    return (startdate=res_startdate, enddate=res_enddate, daydate=res_daydate)
end
combine(gdf, [:startdate, :enddate] => process => AsTable)
(But please check it against your implementation with larger data to confirm it is correct, as I have written it quickly to show you how to write performant implementations with DataFrames.jl.)

Date is not working even when date column is set to index

I have a dictionary of multiple dataframes where the index is set to 'Date', but I am having trouble capturing the specific day for a search.
Dictionary created as per link:
Call a report from a dictionary of dataframes
Then I tried to add the following column to create specific days for each row:
df_dict[k]['Day'] = pd.DatetimeIndex(df['Date']).day
It's not working. The idea is to separate out the day of the month only (from 1 to 31) for each row. When I call the report, it should give me the day of the month of that occurrence.
More details if needed.
Regards and thanks!
In the case of your code, there is no 'Date' column, because it's set as the index.
df_dict = {f.stem: pd.read_csv(f, parse_dates=['Date'], index_col='Date') for f in files}
To extract the day from the index use the following code.
df_dict[k]['Day'] = df.index.day
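Alternatively, if the rest of your code expects a regular 'Date' column, a small sketch that resets the index instead (assuming the dict was built as above):
for k in df_dict:
    df_dict[k] = df_dict[k].reset_index()  # 'Date' becomes an ordinary column again
    df_dict[k]['Day'] = df_dict[k]['Date'].dt.day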
Pulling the code from this question:
# here you can see the Date column is set as the index
df_dict = {f.stem: pd.read_csv(f, parse_dates=['Date'], index_col='Date') for f in files}

data_dict = dict()  # create an empty dict here
for k, df in df_dict.items():
    df_dict[k]['Return %'] = df.iloc[:, 0].pct_change(-1) * 100
    # create a day column; this may not be needed
    df_dict[k]['Day'] = df.index.day
    # aggregate the max and min of Return %
    mm = df_dict[k]['Return %'].agg(['max', 'min'])
    # get the day of the month for the max and min returns
    date_max = df.Day[df['Return %'] == mm.max()].values[0]
    date_min = df.Day[df['Return %'] == mm.min()].values[0]
    # add it to the dict, with ticker as the key
    data_dict[k] = {'max': mm.max(), 'min': mm.min(), 'max_day': date_max, 'min_day': date_min}

# print(data_dict)
[out]:
{'aapl': {'max': 8.702843218147871,
          'max_day': 2,
          'min': -4.900700398891522,
          'min_day': 20},
 'msft': {'max': 6.603769278967109,
          'max_day': 2,
          'min': -4.084428935702855,
          'min_day': 8}}

How to get dates along with the functions I perform?

My initial data frame is like this:
import pandas as pd
df = pd.DataFrame({'serialNo': ['aaaa', 'aaaa', 'cccc', 'ffff'],
                   'Date': ['2018-09-15', '2018-09-16', '2018-09-15', '2018-09-19'],
                   'moduleLocation': ['face', 'head', 'stomach', 'legs'],
                   'moduleName': ['singing', 'dance', 'booze', 'vocals'],
                   'warning': [4402, 3747, 5555, 8754],
                   'failed': [0, 3462, 5161, 3262]})
I have performed the following functions to clean up the data. The first is to cast all the columns to string:
all_columns = list(df)
df[all_columns] = df[all_columns].astype(str)
This is followed by the function to perform certain concatenations:
def concatenate(diagnostics, field, target):
    diagnostics.sort_values(by=['serialNo', field], inplace=True)
    diagnostics.drop_duplicates(inplace=True)
    diagnostics[target] = \
        diagnostics.groupby(['serialNo'], as_index=False)[field].transform(lambda s: ','.join(filter(None, s)))
    diagnostics.drop([field], axis=1, inplace=True)
    diagnostics.drop_duplicates(inplace=True)
    return diagnostics
module = concatenate(df[['serialNo','moduleName']], 'moduleName', 'Module')
Warn = concatenate(df[['serialNo','warning']], 'warning', 'Warn')
Err = concatenate(df[['serialNo','failed']], 'failed', 'Err')
Location = concatenate(df[['serialNo','moduleLocation']], 'moduleLocation', 'Location')
diag_final = pd.merge(module,Warn,on=['serialNo'],how='inner')
diag_final = pd.merge(diag_final,Err,on=['serialNo'],how='inner')
diag_final = pd.merge(diag_final,Location,on=['serialNo'],how='inner')
Now the problem is that the Date column no longer exists in my diag_final data frame, and I would like to have it. I do not want to change the existing function, just make sure that I have the corresponding dates. How should I achieve this?
There are likely to be multiple values for each serial number, so you will have to concatenate the values, similar to what you are doing for moduleLocation and moduleName.
dates = concatenate(df[['serialNo','Date']], 'Date', 'Date_cat')
diag_final = pd.merge(diag_final,dates,on=['serialNo'],how='inner')
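If instead you want a single representative date per serial number rather than a concatenated list, a sketch using a plain groupby aggregation (picking the earliest date is an assumption on my part):
# hypothetical alternative: keep only the earliest date per serial number
first_date = df.groupby('serialNo', as_index=False)['Date'].min()
diag_final = pd.merge(diag_final, first_date, on=['serialNo'], how='inner')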