I have a dataframe with exposure episodes per case:
using DataFrames
using Dates
df = DataFrame(id = [1,1,2,3], startdate = [Date(2018,3,1),Date(2019,4,2),Date(2018,6,4),Date(2018,5,1)], enddate = [Date(2019,4,4),Date(2019,8,5),Date(2019,3,1),Date(2019,4,15)])
I want to expand each episode to its constituent days, eliminating any duplicate days per case resulting from overlapping episodes (case 1 in the example dataframe):
s = similar(df, 0)
for row in eachrow(df)
tf = DataFrame(row)
ttf = repeat(tf, Dates.value.(row.enddate - row.startdate) + 1)
ttf.daydate = ttf.startdate .+ Dates.Day.(0:nrow(ttf) - 1) #a record for each day between start and end days (inclusive)
ttf.start = ttf.daydate .== ttf.startdate #a flag to indicate this record was at the start of an episode
ttf.end = ttf.daydate .== ttf.enddate #a flag to indicate this record was at the end of an episode
append!(s, ttf, cols=:union)
end
sort!(s, [:id,:daydate,:startdate, order(:enddate, rev=true)])
unique!(s,[:id,:daydate]) #to eliminate duplicate dates in the case of episode overlaps (e.g. case 1)
I have a strong suspicion that there is a more efficient way of doing this than the brute-force method I came up with, and any help will be appreciated.
Implementation note: in the actual implementation there are several hundred thousand cases, each with relatively few episodes (median = 1, 75th percentile = 3, maximum = 20) but spanning 20 years or more of exposure, resulting in a very large dataset (several hundred million records). To fit into available memory I have partitioned the dataset on id and used the Threads.@threads macro to loop through the partitions in parallel. The primary purpose of this decomposition into days is not just to eliminate overlaps, but to join the data with other exposure data that is available on a per-day basis.
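In rough outline, the partitioned, threaded loop looks something like this (a simplified sketch, not the actual code; expanddays is a hypothetical stand-in for the day-expansion logic shown above):
partitions = collect(groupby(df, :id))          # one partition per case
results = Vector{DataFrame}(undef, length(partitions))
Threads.@threads for i in eachindex(partitions)
    results[i] = expanddays(DataFrame(partitions[i]))   # expand one partition to daily records
end
expanded = reduce(vcat, results)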
Below is a more complete solution that takes into account some essential details. Each episode is associated with additional attributes; as an example I used locationid (the place where the exposure took place), and there is a need to indicate whether there was a gap between subsequent episodes. The original solution also did not cater for the special case where an episode is fully contained within another episode; such episodes should not be expanded.
using Dates
using DataFrames
function process(startdate, enddate, locationid)
start = startdate[1]
stop = enddate[1]
location = locationid[1]
res_daydate = collect(start:Day(1):stop)
res_startdate = fill(start, length(res_daydate))
res_enddate = fill(stop, length(res_daydate))
res_location = fill(location, length(res_daydate))
gap = 0
res_gap = fill(0, length(res_daydate))
for i in 2:length(startdate)
if startdate[i] > res_daydate[end]
start = startdate[i]
elseif enddate[i] > res_daydate[end]
start = res_daydate[end] + Day(1)
else
continue #this episode is contained within the previous episode
end
if start - res_daydate[end] > Day(1)
gap = gap==0 ? 1 : 0
end
stop = enddate[i]
location = locationid[i]
new_daydate = start:Day(1):stop
append!(res_daydate, new_daydate)
append!(res_startdate, fill(startdate[i], length(new_daydate)))
append!(res_enddate, fill(stop, length(new_daydate)))
append!(res_location, fill(location, length(new_daydate)))
append!(res_gap, fill(gap, length(new_daydate)))
end
return (daydate=res_daydate, startdate=res_startdate, enddate=res_enddate, locationid=res_location, gap = res_gap)
end
function eliminateoverlap()
df = DataFrame(id = [1,1,2,3,3,4,4], startdate = [Date(2018,3,1),Date(2019,4,2),Date(2018,6,4),Date(2018,5,1), Date(2019,5,1), Date(2012,1,1), Date(2012,2,2)],
enddate = [Date(2019,4,4),Date(2019,8,5),Date(2019,3,1),Date(2019,4,15),Date(2019,6,15),Date(2012,6,30), Date(2012,2,10)], locationid=[10,11,21,30,30,40,41])
dfs = sort(df, [:startdate, order(:enddate, rev=true)])
gdf = groupby(dfs, :id, sort=true)
r = combine(gdf, [:startdate, :enddate, :locationid] => process => AsTable)
df = combine(groupby(r, [:id,:gap,:locationid]), :daydate => minimum => :StartDate, :daydate => maximum => :EndDate)
return df
end
df = eliminateoverlap()
Here is something that should be efficient:
dfs = sort(df, [:startdate, order(:enddate, rev=true)])
gdf = groupby(dfs, :id, sort=true)
function process(startdate, enddate)
start = startdate[1]
stop = enddate[1]
res_daydate = collect(start:Day(1):stop)
res_startdate = fill(start, length(res_daydate))
res_enddate = fill(stop, length(res_daydate))
for i in 2:length(startdate)
if startdate[i] > res_daydate[end]
start = startdate[i]
stop = enddate[i]
elseif enddate[i] > res_daydate[end]
start = res_daydate[end] + Day(1)
stop = enddate[i]
end
new_daydate = start:Day(1):stop
append!(res_daydate, new_daydate)
append!(res_startdate, fill(startdate[i], length(new_daydate)))
append!(res_enddate, fill(stop, length(new_daydate)))
end
return (startdate=res_startdate, enddate=res_enddate, daydate=res_daydate)
end
combine(gdf, [:startdate, :enddate] => process => AsTable)
(but please check it against your implementation with larger data to confirm that it is correct, as I have just written it quickly to show you how to write a performant implementation with DataFrames.jl)
I am trying to iteratively add vertices and edges. It seems to work and there are no errors, but I wish to verify that the edges are also correctly added.
The loops below insert at least the nodes, as the printing of the list length at the end shows, but the edges are either 1) not inserted, or 2) the way to collect them in a list is incorrect.
Any help is much appreciated!
def vertices01(nodename, rangelb, rangeub, prop1name, prop1val, prop2name):
    t = g.addV(nodename).property(prop1name, prop1val).property(prop2name, rangelb)
    for i in range(rangelb + 1, rangeub):
        t.addV(nodename).property(prop1name, prop1val).property(prop2name, i)
    t.iterate()

def edges01(from_propname, from_propval, to_propname, rangelb, rangeub, edge_name, edge_prop1name):
    to_propval = rangelb
    edge_prop1val = rangelb
    t = g.V().has(from_propname, from_propval).as_("a").V().has(to_propname, to_propval).as_("b").addE(edge_name).from_("a").to("b").property(edge_prop1name, edge_prop1val)
    for i in range(rangelb, rangeub):
        to_propval = i + 1
        edge_prop1val = i
        # changing this to t.has(...) seems to not influence the results (still 0 picked up by the loop)
        t.has(from_propname, from_propval).as_("a").V().has(to_propname, to_propval).as_("b").addE(edge_name).from_("a").to("b").property(edge_prop1name, edge_prop1val)
    t.iterate()
vertices01("ABC", 1, 21, "aa01", 1, "bb01")
edges01("aa01", 1, "bb01", 1, 10 , "aa01-to-bb01", "aa01-to-bb01-propX")
ls1 = []
ls1 = g.V().outE("aa01-to-bb01").has("aa01-to-bb01-propX", 2).toList()
print(len(ls1))
ls2 = []
ls2 = g.V().has("aa01", 1).toList()
print(len(ls2))
> results:
0
20
Expected results:
> results:
1
20
EDIT: I have changed this bit in the edges01 loop:
t = g.V().has(from_propname, from_propval) ...
to
t.has(from_propname, from_propval) ...
But the results are still 0.
You are starting the traversal over again each time with t = g.V()... in the code that adds edges. Only the very last traversal created is going to get iterated. In the code that creates the vertices you are extending the traversal. That is the difference.
UPDATED
You should be able to do something along these lines:
t = (g.V().has('some-property','some-value').as_('a').
     V().has('some-property','some-value').as_('b'))
and then inside the loop
t.addE('myedge').from_('a').to('b')
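For example, something like this (untested, and it assumes each has() filter identifies exactly one vertex; the "name" property, "link" label and values are placeholders rather than your schema):
t = (g.V().has("name", "v0").as_("a0")
      .V().has("name", "v1").as_("b0")
      .addE("link").from_("a0").to("b0"))
for i in range(1, 10):
    # keep extending the same traversal, using fresh step labels each iteration
    t = (t.V().has("name", "v%d" % i).as_("a%d" % i)
          .V().has("name", "v%d" % (i + 1)).as_("b%d" % i)
          .addE("link").from_("a%d" % i).to("b%d" % i))
t.iterate()  # submit everything once, after the loop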
I am trying to optimize MACD parameters for a trading strategy, but unfortunately I am stuck on the paramset.label value. This is the code:
################################# MACD PARAMETERS OPTIMIZATION
.fastMA <- (20:40)
.slowMA <- (30:70)
.nsamples = 10
strat.st <- 'volStrat'
# Paramset
add.distribution(strat.st,
paramset.label = 'EMA',
component.type = 'indicator',
component.label = 'macd.out',
variable = list(n = .fastMA),
label = 'nFast'
)
add.distribution(strat.st,
paramset.label = 'EMA',
component.type = 'indicator',
component.label = 'macd.out',
variable = list(n = .slowMA),
label = 'nSlow'
)
add.distribution.constraint(strat.st,
paramset.label = 'EMA',
distribution.label.1 = 'nFast',
distribution.label.2 = 'nSlow',
operator = '<',
label = 'nFast<nSlow'
)
results <- apply.paramset(strat.st,
paramset.label = 'EMA',
portfolio = portfolio2.st,
account = account.st,
nsamples = .nsamples,
verbose = TRUE)
stats <- results$tradeStats
print(stats)
When I run it, this error occurs for every sample:
evaluation # 1:
$param.combo
nFast nSlow
379 23 51
[1] "Processing param.combo 379"
nFast nSlow
379 23 51
result of evaluating expression:
<simpleError in strategy[[components.type]][[index]]: subscript out of bounds>
got results for task 1
numValues: 1, numResults: 1, stopped: FALSE
returning status FALSE
And then, for the last one, this is the error:
evaluation # 10:
$param.combo
nFast nSlow
585 40 60
[1] "Processing param.combo 585"
nFast nSlow
585 40 60
result of evaluating expression:
<simpleError in strategy[[components.type]][[index]]: subscript out of bounds>
got results for task 10
numValues: 10, numResults: 10, stopped: FALSE
first call to combine function
evaluating call object to combine results:
fun(result.1, result.2, result.3, result.4, result.5, result.6,
result.7, result.8, result.9, result.10)
error calling combine function:
<simpleError in fun(result.1, result.2, result.3, result.4, result.5, result.6, result.7, result.8, result.9, result.10): attempt to select less than one element>
numValues: 10, numResults: 10, stopped: TRUE
I really don't understand how to fix it.
Can anyone help me solve this?
Thank you so much.
You didn't give the code before the OPTIMIZATION part, so this is only a guess at a direction.
I understand you want to test 20:40 and 30:70, but in your OPTIMIZATION code you add two distributions that both point to component.label = 'macd.out'.
I did a similar test. Although both distributions use MA-type indicators, they generally should not point to the same indicator data (component.label = 'macd.out'). My code worked when one distribution pointed to component.label = 'fast' and the other to component.label = 'slow'; because they pointed to different data, they could be compared.
You can try to debug in this direction.
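For example, something along these lines (an untested sketch; the indicator names and the 'fast'/'slow' labels are illustrative, and it assumes your existing strategy setup plus the .fastMA/.slowMA ranges from your code):
add.indicator(strat.st, name = 'EMA',
              arguments = list(x = quote(Cl(mktdata)), n = 20),
              label = 'fast')
add.indicator(strat.st, name = 'EMA',
              arguments = list(x = quote(Cl(mktdata)), n = 30),
              label = 'slow')
add.distribution(strat.st,
                 paramset.label = 'EMA',
                 component.type = 'indicator',
                 component.label = 'fast',
                 variable = list(n = .fastMA),
                 label = 'nFast')
add.distribution(strat.st,
                 paramset.label = 'EMA',
                 component.type = 'indicator',
                 component.label = 'slow',
                 variable = list(n = .slowMA),
                 label = 'nSlow')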
I am creating some code for a school project, and for a module I use later on, I need to know what the intensity to end on (end_intensity) is. When the code is run, end_intensity still comes out as unassigned, which means that the
if client_intensity == "High":
line is never being run. Can someone please explain why it won't assign?
correct = False
end_intensity = "Unassigned"
while correct != True:
    id_search = input("please enter the Client ID of the client you wish to record results for:")
    # open file, with will automatically close it for you
    with open("text_documents/clientIntensity.txt") as f:
        user_found = False
        # loop over every line
        for line in f:
            client,intensity = line.split(",")
            if id_search == client:
                correct = True
                user_found = True
                intensity = str (intensity)
                client_intensity = intensity
                #assigns which one is the end intensity
                if intensity == 'High':
                    end_intensity = 'Moderate'
                elif intensity == 'Moderate':
                    end_intensity = 'High'
        if user_found == False:
            print("I'm sorry no results we're found for that ID, please try again\n")
print(end_intensity)
The text document is in this format:
NeQua,High
ImKol,Moderate
YoTri,Moderate
(I apologize for the numbers in the text document formatting; Stack Overflow would only let me show it like that.)
Any help would be appreciated, thanks
Ieuan
The following code snippet is from a Python poker server. The program works except when trying to delay the start of a tourney with reactor.callLater.
The variable "wait" gets its integer from an XML file, which has a setting of "60". However, the delay is never implemented and the tourney always starts immediately. I am not very familiar with Python or Twisted, just trying to hack this into working for me. One thing, however, from my perspective: it seems that it shouldn't work, given that I can't see how or where the variable "old_state" gets its value in order for the code to properly determine the states of the server. But perhaps I am mistaken.
I hope that someone familiar with Python and Twisted can see what the problem might be and be willing to enlighten me on this issue.
elif old_state == TOURNAMENT_STATE_REGISTERING and new_state == TOURNAMENT_STATE_RUNNING:
    self.databaseEvent(event = PacketPokerMonitorEvent.TOURNEY_START, param1 = tourney.serial)
    reactor.callLater(0.01, self.tourneyBroadcastStart, tourney.serial)
    # Only obey extra_wait_tourney_start if we had been registering and are now running,
    # since we only want this behavior before the first deal.
    wait = int(self.delays.get('extra_wait_tourney_start', 0))
    if wait > 0:
        reactor.callLater(wait, self.tourneyDeal, tourney)
    else:
        self.tourneyDeal(tourney)
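From what I understand, reactor.callLater(delay, f, *args) simply schedules f to run roughly delay seconds later, and the scheduled call only fires while the reactor is running; a minimal standalone sketch (not from the poker server code, just to illustrate the API):
from twisted.internet import reactor

def deal(name):
    print("dealing " + name)
    reactor.stop()

reactor.callLater(60, deal, "tourney-42")  # schedule deal() for ~60 seconds from now
reactor.run()                              # delayed calls only fire while the reactor is running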
For reference, I have placed below the larger portion of the code that is relevant to the problem.
def spawnTourneyInCore(self, tourney_map, tourney_serial, schedule_serial, currency_serial, prize_currency):
    tourney_map['start_time'] = int(tourney_map['start_time'])
    if tourney_map['sit_n_go'] == 'y':
        tourney_map['register_time'] = int(seconds()) - 1
    else:
        tourney_map['register_time'] = int(tourney_map.get('register_time', 0))
    tourney = PokerTournament(dirs = self.dirs, **tourney_map)
    tourney.serial = tourney_serial
    tourney.verbose = self.verbose
    tourney.schedule_serial = schedule_serial
    tourney.currency_serial = currency_serial
    tourney.prize_currency = prize_currency
    tourney.bailor_serial = tourney_map['bailor_serial']
    tourney.player_timeout = int(tourney_map['player_timeout'])
    tourney.via_satellite = int(tourney_map['via_satellite'])
    tourney.satellite_of = int(tourney_map['satellite_of'])
    tourney.satellite_of, reason = self.tourneySatelliteLookup(tourney)
    tourney.satellite_player_count = int(tourney_map['satellite_player_count'])
    tourney.satellite_registrations = []
    tourney.callback_new_state = self.tourneyNewState
    tourney.callback_create_game = self.tourneyCreateTable
    tourney.callback_game_filled = self.tourneyGameFilled
    tourney.callback_destroy_game = self.tourneyDestroyGame
    tourney.callback_move_player = self.tourneyMovePlayer
    tourney.callback_remove_player = self.tourneyRemovePlayer
    tourney.callback_cancel = self.tourneyCancel
    if not self.schedule2tourneys.has_key(schedule_serial):
        self.schedule2tourneys[schedule_serial] = []
    self.schedule2tourneys[schedule_serial].append(tourney)
    self.tourneys[tourney.serial] = tourney
    return tourney
def deleteTourney(self, tourney):
    if self.verbose > 2:
        self.message("deleteTourney: %d" % tourney.serial)
    self.schedule2tourneys[tourney.schedule_serial].remove(tourney)
    if len(self.schedule2tourneys[tourney.schedule_serial]) <= 0:
        del self.schedule2tourneys[tourney.schedule_serial]
    del self.tourneys[tourney.serial]

def tourneyResumeAndDeal(self, tourney):
    self.tourneyBreakResume(tourney)
    self.tourneyDeal(tourney)
def tourneyNewState(self, tourney, old_state, new_state):
    cursor = self.db.cursor()
    updates = [ "state = '" + new_state + "'" ]
    if old_state != TOURNAMENT_STATE_BREAK and new_state == TOURNAMENT_STATE_RUNNING:
        updates.append("start_time = %d" % tourney.start_time)
    sql = "update tourneys set " + ", ".join(updates) + " where serial = " + str(tourney.serial)
    if self.verbose > 2:
        self.message("tourneyNewState: " + sql)
    cursor.execute(sql)
    if cursor.rowcount != 1:
        self.error("modified %d rows (expected 1): %s " % ( cursor.rowcount, sql ))
    cursor.close()
    if new_state == TOURNAMENT_STATE_BREAK:
        # When we are entering BREAK state for the first time, which
        # should only occur here in the state change operation, we
        # send the PacketPokerTableTourneyBreakBegin. Note that this
        # code is here and not in tourneyBreakCheck() because that
        # function is called over and over again, until the break
        # finishes. Note that tourneyBreakCheck() also sends a
        # PacketPokerGameMessage() with the time remaining, too.
        secsLeft = tourney.remainingBreakSeconds()
        if secsLeft == None:
            # eek, should I really be digging down into tourney's
            # member variables in this next assignment?
            secsLeft = tourney.breaks_duration
        resumeTime = seconds() + secsLeft
        for gameId in map(lambda game: game.id, tourney.games):
            table = self.getTable(gameId)
            table.broadcast(PacketPokerTableTourneyBreakBegin(game_id = gameId, resume_time = resumeTime))
        self.tourneyBreakCheck(tourney)
    elif old_state == TOURNAMENT_STATE_BREAK and new_state == TOURNAMENT_STATE_RUNNING:
        wait = int(self.delays.get('extra_wait_tourney_break', 0))
        if wait > 0:
            reactor.callLater(wait, self.tourneyResumeAndDeal, tourney)
        else:
            self.tourneyResumeAndDeal(tourney)
    elif old_state == TOURNAMENT_STATE_REGISTERING and new_state == TOURNAMENT_STATE_RUNNING:
        self.databaseEvent(event = PacketPokerMonitorEvent.TOURNEY_START, param1 = tourney.serial)
        reactor.callLater(0.01, self.tourneyBroadcastStart, tourney.serial)
        # Only obey extra_wait_tourney_start if we had been registering and are now running,
        # since we only want this behavior before the first deal.
        wait = int(self.delays.get('extra_wait_tourney_start', 0))
        if wait > 0:
            reactor.callLater(wait, self.tourneyDeal, tourney)
        else:
            self.tourneyDeal(tourney)
    elif new_state == TOURNAMENT_STATE_RUNNING:
        self.tourneyDeal(tourney)
    elif new_state == TOURNAMENT_STATE_BREAK_WAIT:
        self.tourneyBreakWait(tourney)
I have discovered that this code imports several files from another directory that I did not examine. I also made a false assumption about the purpose of this code block: I expected the function to apply an arbitrary delay of n seconds to every tourney, but in practice it implements the delay only when a player forgets about the game and does not show up for it. These facts became clear once I examined the proper files. Lesson learned: look at all the imports!