Efficient way to expand a DataFrame in Julia

I have a dataframe with exposure episodes per case:
using DataFrames
using Dates
df = DataFrame(id = [1,1,2,3], startdate = [Date(2018,3,1),Date(2019,4,2),Date(2018,6,4),Date(2018,5,1)], enddate = [Date(2019,4,4),Date(2019,8,5),Date(2019,3,1),Date(2019,4,15)])
I want to expand each episode to its constituent days, eliminating any duplicate days per case resulting from overlapping episodes (case 1 in the example dataframe):
s = similar(df, 0)
for row in eachrow(df)
    tf = DataFrame(row)
    ttf = repeat(tf, Dates.value.(row.enddate - row.startdate) + 1)
    ttf.daydate = ttf.startdate .+ Dates.Day.(0:nrow(ttf) - 1) #a record for each day between start and end days (inclusive)
    ttf.start = ttf.daydate .== ttf.startdate #a flag to indicate this record was at the start of an episode
    ttf[!, :end] = ttf.daydate .== ttf.enddate #a flag to indicate this record was at the end of an episode (:end used via indexing since `end` is a reserved word)
    append!(s, ttf, cols=:union)
end
sort!(s, [:id,:daydate,:startdate, order(:enddate, rev=true)])
unique!(s,[:id,:daydate]) #to eliminate duplicate dates in the case of episode overlaps (e.g. case 1)
I have a strong suspicion that there is a more efficient way of doing this than the brute-force method I came up with, and any help will be appreciated.
Implementation note: In the actual implementation there are several hundred thousand cases, each with relatively few episodes (median = 1, 75th percentile = 3, maximum = 20), but spanning 20 years or more of exposure, resulting in a very large dataset (several hundred million records). To fit into available memory I have partitioned the dataset on id and used the Threads.@threads macro to loop through the partitions in parallel. The primary purpose of this decomposition into days is not just to eliminate overlaps, but to join the data with other exposure data that is available on a per-day basis.
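For illustration, here is a minimal Julia sketch of the partition-and-thread pattern described above; the round-robin split of ids and the expand_to_days function are assumptions for illustration, not part of the original code:
using DataFrames, Dates
# Sketch only: expand_to_days stands in for the per-partition day expansion shown above.
function process_partitions(df::DataFrame, nparts::Int, expand_to_days)
    ids = unique(df.id)
    chunks = [ids[i:nparts:end] for i in 1:nparts]     # split ids round-robin into partitions
    results = Vector{DataFrame}(undef, nparts)
    Threads.@threads for p in 1:nparts
        part = df[in.(df.id, Ref(Set(chunks[p]))), :]  # rows belonging to this partition's ids
        results[p] = expand_to_days(part)              # expand each partition's episodes to days
    end
    return reduce(vcat, results)
end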

Below is a more complete solution that takes into account some essential details. Each episode is associated with additional attributes; as an example I used locationid (the place where the exposure took place), and there is also a need to indicate whether there was a gap between subsequent episodes. The original solution also did not cater for the special case where an episode is fully contained within another episode - such episodes should not be expanded.
using Dates
using DataFrames
function process(startdate, enddate, locationid)
    start = startdate[1]
    stop = enddate[1]
    location = locationid[1]
    res_daydate = collect(start:Day(1):stop)
    res_startdate = fill(start, length(res_daydate))
    res_enddate = fill(stop, length(res_daydate))
    res_location = fill(location, length(res_daydate))
    gap = 0
    res_gap = fill(0, length(res_daydate))
    for i in 2:length(startdate)
        if startdate[i] > res_daydate[end]
            start = startdate[i]
        elseif enddate[i] > res_daydate[end]
            start = res_daydate[end] + Day(1)
        else
            continue #this episode is contained within the previous episode
        end
        if start - res_daydate[end] > Day(1)
            gap = gap==0 ? 1 : 0
        end
        stop = enddate[i]
        location = locationid[i]
        new_daydate = start:Day(1):stop
        append!(res_daydate, new_daydate)
        append!(res_startdate, fill(startdate[i], length(new_daydate)))
        append!(res_enddate, fill(stop, length(new_daydate)))
        append!(res_location, fill(location, length(new_daydate)))
        append!(res_gap, fill(gap, length(new_daydate)))
    end
    return (daydate=res_daydate, startdate=res_startdate, enddate=res_enddate, locationid=res_location, gap = res_gap)
end
function eliminateoverlap()
    df = DataFrame(id = [1,1,2,3,3,4,4],
                   startdate = [Date(2018,3,1),Date(2019,4,2),Date(2018,6,4),Date(2018,5,1), Date(2019,5,1), Date(2012,1,1), Date(2012,2,2)],
                   enddate = [Date(2019,4,4),Date(2019,8,5),Date(2019,3,1),Date(2019,4,15),Date(2019,6,15),Date(2012,6,30), Date(2012,2,10)],
                   locationid=[10,11,21,30,30,40,41])
    dfs = sort(df, [:startdate, order(:enddate, rev=true)])
    gdf = groupby(dfs, :id, sort=true)
    r = combine(gdf, [:startdate, :enddate, :locationid] => process => AsTable)
    df = combine(groupby(r, [:id,:gap,:locationid]), :daydate => minimum => :StartDate, :daydate => maximum => :EndDate)
    return df
end
df = eliminateoverlap()

Here is something that should be efficient:
dfs = sort(df, [:startdate, order(:enddate, rev=true)])
gdf = groupby(dfs, :id, sort=true)
function process(startdate, enddate)
    start = startdate[1]
    stop = enddate[1]
    res_daydate = collect(start:Day(1):stop)
    res_startdate = fill(start, length(res_daydate))
    res_enddate = fill(stop, length(res_daydate))
    for i in 2:length(startdate)
        if startdate[i] > res_daydate[end]
            start = startdate[i]
            stop = enddate[i]
        elseif enddate[i] > res_daydate[end]
            start = res_daydate[end] + Day(1)
            stop = enddate[i]
        end
        new_daydate = start:Day(1):stop
        append!(res_daydate, new_daydate)
        append!(res_startdate, fill(startdate[i], length(new_daydate)))
        append!(res_enddate, fill(stop, length(new_daydate)))
    end
    return (startdate=res_startdate, enddate=res_enddate, daydate=res_daydate)
end
combine(gdf, [:startdate, :enddate] => process => AsTable)
(but please check it against your implementation with larger data to make sure it is correct, as I have just written it quickly to show you how to write performant implementations with DataFrames.jl)
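For example, one way to do such a check (a sketch only, assuming both s from the question's brute-force loop and gdf defined above are in scope) is to compare the sets of (id, daydate) pairs produced by the two approaches:
fast = combine(gdf, [:startdate, :enddate] => process => AsTable)
brute = unique(s[:, [:id, :daydate]])   # day-level result from the brute-force loop
same = sort(unique(fast[:, [:id, :daydate]]), [:id, :daydate]) == sort(brute, [:id, :daydate])
@show same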

Related

Pyspark dataframe: process each row based on a set of rows from the same dataframe

The piece of code below works fine for a smaller dataframe, as it brings all the data into driver memory, but I want to leverage Spark's distributed computing. Can someone help me with how to leverage Pyspark's capabilities?
import pandas as pd

def topNSuggestion(row, noSuggestionLabel, n):
    #create output suggestion list
    suggestion = [noSuggestionLabel] * n
    #Create data frame for row received as an input to this method
    suggestedNameDataFrame = pd.DataFrame({'name':row.index})
    suggestedCountDataFrame = pd.DataFrame([eachRow for eachRow in row.values], columns=['countOrder','countQuantity'])
    suggestedNameCountDataFrame = pd.concat([suggestedNameDataFrame,suggestedCountDataFrame], axis=1)
    #Sort suggestion data frame on number of orders and quantity
    sortSuggestedNameCountDataFrame = suggestedNameCountDataFrame.sort_values(by=['countOrder','countQuantity'], ascending=False)
    sortSuggestedNameCountDataFrame = sortSuggestedNameCountDataFrame.reset_index(drop=True)
    countProduct = sortSuggestedNameCountDataFrame.shape[0]
    #Update suggestion list only if number of orders are more than zero else NO suggestions
    countSuggestion = 0
    while(countSuggestion < countProduct and countSuggestion < n):
        if sortSuggestedNameCountDataFrame.countOrder[countSuggestion] > 0:
            suggestion[countSuggestion] = str(sortSuggestedNameCountDataFrame.name[countSuggestion])
        countSuggestion += 1
    return suggestion
My main code to process each row in the dataframe is below. I am using the dataframe.collect method, which negates the benefits of distributed computing.
from pyspark.sql import functions as F

noSuggestionLabel = 'No Suggestion'
numberOfSuggestions = 5
#create product suggest data frame
customerProductSuggestDataFrame = aggregateQuantityFilterDataFrame.drop_duplicates(['CustomerId', 'ProductId', 'ProductName'])
customerProductSuggestDataFrame = customerProductSuggestDataFrame.drop('STORE_NUMBER', 'OrderId', 'AmountPurchased', 'ProductCode')
customerProductSuggestDataFrame = customerProductSuggestDataFrame.sort('CustomerId','ProductId')
for i in range(numberOfSuggestions):
    suggestion = 'Suggestion' + str(i+1) + 'ItemDescription'
    customerProductSuggestDataFrame = customerProductSuggestDataFrame.withColumn(suggestion, F.lit(noSuggestionLabel))
previousCustomerId = 0
#Process each product suggestion
customerProductSuggestDataFrameCollect = customerProductSuggestDataFrame.collect()
for eachCustomerProductSuggest in customerProductSuggestDataFrameCollect:
    if previousCustomerId != eachCustomerProductSuggest.CustomerId:
        currentCustomerId = eachCustomerProductSuggest.CustomerId
        #Filter customer level purchase history info
        eachCustomerAggregateQuantityDataFrame = aggregateQuantityFilterDataFrame.filter(aggregateQuantityFilterDataFrame.CustomerId == eachCustomerProductSuggest.CustomerId)
        #Create product matrix for each customer
        #Get all unique product code from store purchase history info for this customer
        uniqueProducts = list(eachCustomerAggregateQuantityDataFrame.select('ProductId').distinct().toPandas()['ProductId'])
        uniqueProducts.sort()
        #create product ranking data frame with rows and columns as unique product code only
        productDataFrame = pd.DataFrame(index=uniqueProducts,columns=uniqueProducts)
        for index,eachRow in productDataFrame.iterrows():
            for col in productDataFrame.columns:
                productDataFrame[index][col] = dict({'countOrder': 0, 'countQuantity':0})
        previousOrderId = 0
        eachCustomerAggregateQuantityDataFrameCollect = eachCustomerAggregateQuantityDataFrame.collect()
        for eachCustomerOrderAggr in eachCustomerAggregateQuantityDataFrameCollect:
            if previousOrderId != eachCustomerOrderAggr.OrderId:
                productQuantityDict = {}
                previousOrderId = eachCustomerOrderAggr.OrderId
            #Iterate through all products to process previous products in the same order
            for previousProductOrder in productQuantityDict:
                #Only if not same product
                if previousProductOrder != eachCustomerOrderAggr.ProductId:
                    #Update number of orders for each product in product matrix
                    productDataFrame.loc[eachCustomerOrderAggr.ProductId,previousProductOrder]['countOrder'] = productDataFrame.loc[eachCustomerOrderAggr.ProductId,previousProductOrder]['countOrder'] + 1
                    productDataFrame.loc[previousProductOrder,eachCustomerOrderAggr.ProductId]['countOrder'] = productDataFrame.loc[previousProductOrder,eachCustomerOrderAggr.ProductId]['countOrder'] + 1
                    #Update quantity for each product in product matrix
                    productDataFrame.loc[eachCustomerOrderAggr.ProductId,previousProductOrder]['countQuantity'] = productDataFrame.loc[eachCustomerOrderAggr.ProductId,previousProductOrder]['countQuantity'] + productQuantityDict[previousProductOrder]
                    productDataFrame.loc[previousProductOrder,eachCustomerOrderAggr.ProductId]['countQuantity'] = productDataFrame.loc[previousProductOrder,eachCustomerOrderAggr.ProductId]['countQuantity'] + eachCustomerOrderAggr.AmountPurchased
            #Add product that is processed to product list
            productQuantityDict[eachCustomerOrderAggr.ProductId] = eachCustomerOrderAggr.AmountPurchased
        #Create dataframe with top n suggestions for each product
        productSuggestion = productDataFrame.apply(lambda row:topNSuggestion(row, noSuggestionLabel, n=numberOfSuggestions), axis=1)
    #Add Suggestions created for each ProductId to current customer
    if currentCustomerId == eachCustomerProductSuggest.CustomerId:
        eachSuggestionList = productSuggestion.loc[eachCustomerProductSuggest.ProductId]
        #Append suggestions to product suggestion data frame
        count = 1
        for eachSuggestion in eachSuggestionList:
            suggestion = 'Suggestion' + str(count) + 'ItemDescription'
            if eachSuggestion != noSuggestionLabel:
                eachSuggestionProductName = productInfoDataFrame.filter(productInfoDataFrame.ProductId == eachSuggestion).collect()[0]['ProductName']
            else:
                eachSuggestionProductName = str(eachSuggestion)
            customerProductSuggestDataFrame = customerProductSuggestDataFrame.withColumn(suggestion, F.when((F.col("CustomerId") == eachCustomerProductSuggest.CustomerId)&(F.col("ProductId") == eachCustomerProductSuggest.ProductId), F.lit(eachSuggestionProductName)).otherwise(F.col(suggestion)))
            count += 1
    previousCustomerId = eachCustomerProductSuggest.CustomerId
Any help to point me in the right direction is appreciated.

Sliding window method over a large range using numpy vectorization

I'm trying to implement a sliding window method for a genomics dataset that I have, over a fairly long range (upwards of 50k nucleotides). My approach so far works fine, but it is fairly slow (taking several seconds per range, and several minutes per range at intervals >150k bp). Here is my code so far:
import numpy as np
import pandas as pd

VectorizedRange = np.arange(Start, End) #Start, End genomic flags on the reference genome
SlidingWindow = np.lib.stride_tricks.sliding_window_view(VectorizedRange, 100) #100 = the window size
GroupedDictFrame = pd.DataFrame({"Bins":GenomeRange})
GroupedDictFrame["ReadCov"] = 0
GroupedDictFrame["ReadSeq"] = [list() for _ in range(len(GroupedDictFrame.index.values))]
GroupedDictFrame.set_index(keys=["Bins"], inplace=True, drop=True)
def Appender(Start, End, Width, Seq):
    AvgCov = 0
    SeqList = []
    if End <= Window[-1]:
        AvgCov += 1
        SeqList.append(Seq)
    elif End > Window[-1]:
        AvgCov += (Window[-1] - Start)/Width
        SeqList.append(Seq[0:(Window[-1] - Start)])
    GroupedDictFrame.loc[Window[0], "ReadCov"] += AvgCov
    GroupedDictFrame.loc[Window[0], "ReadSeq"] = SeqList

for Window in SlidingWindow:
    SubsetBAM = BAMFrame[(
        (BAMFrame["start_coord"]>=Window[0])&
        (BAMFrame["start_coord"]<=Window[-1])
    )].reset_index(drop=True)
    SubsetBAM.apply(
        lambda x: Appender(x.start_coord,
                           x.end_coord,
                           x.width_lis,
                           x.seq_lis), axis=1
    )
I think my vectorization isn't the best; any suggestions for speeding this up?
So I think I figured it out on my own; I'll add my solution in case anyone else faces a similar problem.
Essentially, I stopped subsetting the dataframe containing the small DNA read fragments inside the for loop, and instead did one subset before the loop and converted it to a numpy array.
I removed my function and used numpy.where to do all my logic.
import numpy as np
import pandas as pd

VectorizedRange = np.arange(Start, End)
SlidingWindow = np.lib.stride_tricks.sliding_window_view(VectorizedRange, 100)
GroupedDictFrame = pd.DataFrame({"Bins":GenomeRange})
GroupedDictFrame["ReadCov"] = 0
GroupedDictFrame["ReadSeq"] = [list() for _ in range(len(GroupedDictFrame.index.values))]
GroupedDictFrame.set_index(keys=["Bins"], inplace=True, drop=True)
CoordArray = BAMFrame.loc[:, "start_coord":"end_coord"].to_numpy()
for Window in SlidingWindow:
    ReadCovIn = np.where(((CoordArray[:,1] <= Window[-1]) & (CoordArray[:,0] >= Window[0])), 1, 0)
    ReadCovOut = np.where(((CoordArray[:,1] > Window[-1]) & ((CoordArray[:,0] >= Window[0]) & (CoordArray[:,0] < Window[-1]))),
                          (Window[-1] - CoordArray[:,0])/(CoordArray[:,1] - CoordArray[:,0]), 0)
    GroupedDictFrame.loc[Window[0], "ReadCov"] += np.sum((np.sum(ReadCovIn), np.sum(ReadCovOut)))
I've gotten it down to ~1 second per gene region (typically about 50 kb, so the SlidingWindow has a shape of (49900, 100)), which is pretty good, I think!

pandas groupby according to the group of days of the week selected

I have this dataframe:
import numpy as np
import pandas as pd

rng = pd.date_range(start='2018-01-01', end='2018-01-21')
rnd_values = np.random.rand(len(rng))+3
df = pd.DataFrame({'time':rng.to_list(),'value':rnd_values})
Let's say that I want to group it according to the day of the week and compute the mean:
df['span'] = np.where(df['time'].dt.day_of_week <= 2, 'Th-Sn', 'Mn-Wd')
df['wkno'] = df['time'].dt.isocalendar().week.shift(fill_value=0)
df.groupby(['wkno','span']).mean()
However, I would like to make this procedure more general.
Let's say that I define the following days of the week:
days=['Monday','Thursday']
Is there any option that allows me to do what I have done above by using "days"? I imagine that I have to compute the number of days between 'Monday' and 'Thursday' and then use that number. What about the case when
days=['Monday','Thursday','Friday']
I was thinking to set-up a dictionary as:
days={'Monday':0,'Thursday':3,'Friday':4}
then
idays = list(days.values())[:]
How can I now use idays inside np.where? Indeed, I would then have three intervals.
Thanks
If you want to use more than one threshold you need np.searchsorted; the resulting function would look something like:
def groupby_daysspan_week(dfc,days):
    df = dfc.copy()
    day_to_dayofweek = {'Monday':0,'Tuesday':1,'Wednesday':2,
                        'Thursday':3,'Friday':4,'Saturday':5,'Sunday':6}
    short_dict = {0:'Mn',1:'Tu',2:'Wd',3:'Th',4:'Fr',5:'St',6:'Sn'}
    day_split = [day_to_dayofweek[d] for d in days]
    df['wkno'] = df['time'].dt.isocalendar().week
    df['dow'] = df['time'].dt.day_of_week
    df['span'] = np.searchsorted(day_split,df['dow'],side='right')
    span_name_dict = {i+1:short_dict[day_split[i]]+'-'+short_dict[(day_split+[6])[i+1]]
                      for i in range(len(day_split))}
    df_agg = df.groupby(['wkno','span'])['value'].mean()
    df_agg = df_agg.rename(index=span_name_dict,level=1)
    return df_agg

Pandas Timeseries: Total duration meeting a specific condition

I have a timeseries
ts = pd.Series(data=[0,1,2,3,4],index=[pd.Timestamp('1991-01-01'),pd.Timestamp('1995-01-01'),pd.Timestamp('1996-01-01'),pd.Timestamp('2010-01-01'),pd.Timestamp('2011-01-01')])
What's the fastest, most readable way to get the total duration in which the value is below 2, assuming the values are valid until the next time step indicates otherwise (no linear interpolation)? I imagine there probably is a pandas function for this.
This seems to be working quite well; however, I am still baffled that there does not seem to be a pandas function for this!
import pandas as pd
import numpy as np
ts = pd.Series(data=[0,1,2,3,4],index=[pd.Timestamp('1991-01-01'),pd.Timestamp('1995-01-01'),pd.Timestamp('1996-01-01'),pd.Timestamp('2010-01-01'),pd.Timestamp('2011-01-01')])
# making the timeseries binary. 1 = meets condition, 0 = does not
ts = ts.where(ts>=2,other=1)
ts = ts.where(ts<2,other=0)
delta_time = ts.index.to_pydatetime()[1:]-ts.index.to_pydatetime()[:-1]
time_below_2 = np.sum(delta_time[np.invert(ts.values[:-1])]).total_seconds()
time_above_2 = np.sum(delta_time[(ts.values[:-1])]).total_seconds()
The approach above seems to break for certain timeframes. This option is slower, but did not break in any of my tests:
def get_total_duration_above_and_below_value(value,ts):
    # making the timeseries binary. 1 = above value, 0 = below value
    ts = ts.where(ts >= value, other=1)
    ts = ts.where(ts < value, other=0)
    time_above_value = 0
    time_below_value = 0
    for i in range(ts.size - 1):
        if ts[i] == 1:
            time_above_value += abs(pd.Timedelta(
                ts.index[i] - ts.index[i + 1]).total_seconds()) / 3600
        else:
            time_below_value += abs(pd.Timedelta(
                ts.index[i] - ts.index[i + 1]).total_seconds()) / 3600
    return time_above_value, time_below_value

Data handling on multiple Heart rate files

I have been collecting the heart rates of 12 calves, each of which received an anesthetic through four different routes of administration. I now have 48 txt files of this format:
Time HRbpm
0:00:01.7 97
0:00:02.3 121
0:00:02.8 15
... ...
HR was recorded for around 2 hours. The Time column was dependent on the monitor, resulting in inconsistent time intervals between measurements.
The txt files are named as follows: 6133_IM_27.00.txt
Here 6133 is the ID, IM the route, and 27.00 the time (mm.ss) at which the treatment was injected.
My first goal is to have all the HR data so I can do an outlier analysis.
Then, I would like to include all this data in a single data frame that would look like this:
data.frame(ID=c(6133,6133,6133,6133,"...",6134,6134,"..."),
           Route = c("IM","IM","IM","IM","...","SC","SC","..."),
           time=c(0, 10, 20, 30,"...",0,10,"..."),
           HR=c(160, 150, 145, 130,"...",162,158,"..."))
The time column goes from 0 to 120 in 10 min increments.
Each HR of this df would represent the mean of the HR values for the preceding minute for a given time (e.g. for time = 30, HR would represent the mean between 29 and 30 minutes for a given ID/Route combination).
I'm fairly new to R, so I've been having trouble just knowing from what angle to start on this problem. Any help would be welcome.
Thanks,
Thomas
For anyone who stumbles on this post, here's what I've done; it seems to be working.
library(plyr)
library(reshape)
library(ggplot2)
setwd("/directory")
filelist = list.files(pattern = ".*.txt")
datalist = lapply(filelist, read.delim)
for (i in 1:length(datalist))
{datalist[[i]][3] = filelist[i]}
df = do.call("rbind", datalist)
attach(df)
out_lowHR = quantile(HRbpm,0.25)-1.5*IQR(HRbpm)
out_highHR = quantile(HRbpm,0.75)+1.5*IQR(HRbpm) #outliers thresholds: 60 and 200
dfc = subset(df,HRbpm>=60 & HRbpm<=200)
(length(df$HRbpm)-length(dfc$HRbpm))/length(df$HRbpm)*100 #8.6% of values excluded
df = dfc
df$ID = substr(df$V3,4,7)
df$ROA = substr(df$V3,9,11)
df$ti = substr(df$V3,13,17)
df$Time = as.POSIXct(as.character(df$Time), format="%H:%M:%S")
df$ti = as.POSIXct(as.character(df$ti), format="%M.%S")
df$t = as.numeric(df$Time-df$ti)
m=60
meanHR = ddply(df, c("ROA","ID"), summarise,
mean0 = mean(HRbpm[t>-60*m & t <=0]),
mean10 = mean(HRbpm[t>9*m & t <=10*m]),
mean20 = mean(HRbpm[t>19*m & t <=20*m]),
mean30 = mean(HRbpm[t>29*m & t <=30*m]),
mean45 = mean(HRbpm[t>44*m & t <=45*m]),
mean60 = mean(HRbpm[t>59*m & t <=60*m]),
mean90 = mean(HRbpm[t>89*m & t <=90*m]),
mean120 = mean(HRbpm[t>119*m & t <=120*m]))
meanHR = melt(meanHR)
meanHR$time = as.numeric(gsub("mean", "", meanHR$variable))
ggplot(meanHR, aes(x = time, y = value, col = ROA))+
geom_smooth()+
theme_classic()