dataframe results changes to zero after adding return - pandas

I am trying to pass "buy_list" in the code below to df . This is a small section of the code when the full code is executed I get the results of a back test results linked image.
initial results
replacement_stocks = portfolio_size - len(kept_positions)
buy_list = ranking_table.loc[
~ranking_table.index.isin(kept_positions)][:replacement_stocks]
new_portfolio = pd.concat(
(buy_list,
ranking_table.loc[ranking_table.index.isin(kept_positions)])
)
When I define df as in below I get df not defined error
replacement_stocks = portfolio_size - len(kept_positions)
buy_list = ranking_table.loc[
~ranking_table.index.isin(kept_positions)][:replacement_stocks]
new_portfolio = pd.concat(
(buy_list,
ranking_table.loc[ranking_table.index.isin(kept_positions)])
)
df1 = buy_list # ceate df1 with buy_list
df2 = ranking_table.loc[
~ranking_table.index.isin(kept_positions)][:replacement_stocks]# create df2 with buy_list
I tried the solution in the link below
Similar error with suggested fix
Following this I still get df not defined error and the output of my back test changes to 0% in all the month which previously had actual % change negative and positive
replacement_stocks = portfolio_size - len(kept_positions)
buy_list = ranking_table.loc[
~ranking_table.index.isin(kept_positions)][:replacement_stocks]
new_portfolio = pd.concat(
(buy_list,
ranking_table.loc[ranking_table.index.isin(kept_positions)])
)
return buy_list
df2 = ranking_table.loc[
~ranking_table.index.isin(kept_positions)][:replacement_stocks]
print(df2)
This is what I now end up with
Error message
I'd appreciate any suggestions on how I can fix this.
Thanks,
Last1
Below is the full code as requested, it's from a book am working through, Trading Evolved by Andreas Clenow.
Thanks again.
%matplotlib inline
import zipline
from zipline.api import order_target_percent, symbol, \
set_commission, set_slippage, schedule_function, \
date_rules, time_rules, attach_pipeline, pipeline_output
from pandas import Timestamp
import matplotlib.pyplot as plt
import pyfolio as pf
import pandas as pd
import numpy as np
from scipy import stats
from zipline.finance.commission import PerDollar
from zipline.finance.slippage import VolumeShareSlippage, FixedSlippage
from zipline_norgatedata.pipelines import NorgateDataIndexConstituent
from zipline.pipeline import Pipeline
"""
Model Settings
"""
intial_portfolio = 100000
momentum_window1 = 125
momentum_window2 = 125
minimum_momentum = 40
portfolio_size = 30
vola_window = 20
# Trend filter settings
enable_trend_filter = True
trend_filter_symbol = '$SPXTR'
trend_filter_window = 200
"""
Commission and Slippage Settings
"""
enable_commission = True
commission_pct = 0.001
enable_slippage = True
slippage_volume_limit = 0.025
slippage_impact = 0.05
"""
Helper functions.
"""
def momentum_score(ts):
"""
Input: Price time series.
Output: Annualized exponential regression slope,
multiplied by the R2
"""
# Make a list of consecutive numbers
x = np.arange(len(ts))
# Get logs
log_ts = np.log(ts)
# Calculate regression values
slope, intercept, r_value, p_value, std_err = stats.linregress(x, log_ts)
# Annualize percent
annualized_slope = (np.power(np.exp(slope), 252) - 1) * 100
#Adjust for fitness
score = annualized_slope * (r_value ** 2)
return score
def volatility(ts):
return ts.pct_change().rolling(vola_window).std().iloc[-1]
"""
Initialization and trading logic
"""
def make_pipeline():
indexconstituent = NorgateDataIndexConstituent('$SPX')
return Pipeline(
columns={
'NorgateDataIndexConstituent':indexconstituent},
screen = indexconstituent)
def initialize(context):
attach_pipeline(make_pipeline(), 'norgatedata_pipeline', chunks=9999,eager=True)
# Set commission and slippage.
if enable_commission:
comm_model = PerDollar(cost=commission_pct)
else:
comm_model = PerDollar(cost=0.0)
set_commission(comm_model)
if enable_slippage:
slippage_model=VolumeShareSlippage(volume_limit=slippage_volume_limit, price_impact=slippage_impact)
set_slippage(slippage_model)
else:
slippage_model=FixedSlippage(spread=0.0)
# Used only for progress output.
context.last_month = intial_portfolio
# Store index membership
#context.index_members = pd.read_csv('../data/index_members/sp500.csv', index_col=0, parse_dates=[0])
#Schedule rebalance monthly.
schedule_function(
func=rebalance,
date_rule=date_rules.month_start(),
time_rule=time_rules.market_open()
)
def output_progress(context):
"""
Output some performance numbers during backtest run
"""
# Get today's date
today = zipline.api.get_datetime().date()
# Calculate percent difference since last month
perf_pct = (context.portfolio.portfolio_value / context.last_month) - 1
# Print performance, format as percent with two decimals.
print("{} - Last Month Result: {:.2%}".format(today, perf_pct))
# Remember today's portfolio value for next month's calculation
context.last_month = context.portfolio.portfolio_value
def rebalance(context, data):
# Write some progress output during the backtest
output_progress(context)
context.pipeline_data = pipeline_output('norgatedata_pipeline')
todays_universe = context.pipeline_data.index
# Check how long history window we need.
hist_window = max(momentum_window1,
momentum_window2)
# Get historical data
hist = data.history(todays_universe, "close", hist_window, "1d")
# Slice the history to match the two chosen time frames.
momentum_hist1 = hist[(-1 * momentum_window1):]
momentum_hist2 = hist[(-1 * momentum_window2):]
# Calculate momentum values for the two time frames.
momentum_list1 = momentum_hist1.apply(momentum_score)
momentum_list2 = momentum_hist2.apply(momentum_score)
# Now let's put the two momentum values together, and calculate mean.
momentum_concat = pd.concat((momentum_list1, momentum_list2))
mom_by_row = momentum_concat.groupby(momentum_concat.index)
mom_means = mom_by_row.mean()
# Sort by momentum value.
ranking_table = mom_means.sort_values(ascending=False)
"""
Sell Logic
First we check if any existing position should be sold.
* Sell if stock is no longer part of index.
* Sell if stock has too low momentum value.
"""
kept_positions = list(context.portfolio.positions.keys())
for security in context.portfolio.positions:
if (security not in todays_universe):
order_target_percent(security, 0.0)
kept_positions.remove(security)
elif ranking_table[security] < minimum_momentum:
order_target_percent(security, 0.0)
kept_positions.remove(security)
"""
Trend Filter Section
"""
if enable_trend_filter:
ind_hist = data.history(
symbol(trend_filter_symbol),
'close',
trend_filter_window,
'1d'
)
trend_filter = ind_hist.iloc[-1] > ind_hist.mean()
if trend_filter == False:
return
"""
Stock Selection Logic
Check how many stocks we are keeping from last month.
Fill from top of ranking list, until we reach the
desired total number of portfolio holdings.
"""
replacement_stocks = portfolio_size - len(kept_positions)
buy_list = ranking_table.loc[
~ranking_table.index.isin(kept_positions)][:replacement_stocks]
new_portfolio = pd.concat(
(buy_list,
ranking_table.loc[ranking_table.index.isin(kept_positions)])
)
"""
Calculate inverse volatility for stocks,
and make target position weights.
"""
vola_table = hist[new_portfolio.index].apply(volatility)
inv_vola_table = 1 / vola_table
sum_inv_vola = np.sum(inv_vola_table)
vola_target_weights = inv_vola_table / sum_inv_vola
for security, rank in new_portfolio.iteritems():
weight = vola_target_weights[security]
if security in kept_positions:
order_target_percent(security, weight)
else:
if ranking_table[security] > minimum_momentum:
order_target_percent(security, weight)
def analyze(context, perf):
perf['max'] = perf.portfolio_value.cummax()
perf['dd'] = (perf.portfolio_value / perf['max']) - 1
maxdd = perf['dd'].min()
ann_ret = (np.power((perf.portfolio_value.iloc[-1] / perf.portfolio_value.iloc[0]),(252 / len(perf)))) - 1
print("Annualized Return: {:.2%} Max Drawdown: {:.2%}".format(ann_ret, maxdd))
return
start_date = Timestamp('2015-01-01',tz='UTC')
end_date = Timestamp('2020-03-14',tz='UTC')
perf = zipline.run_algorithm(
start=start_date, end=end_date,
initialize=initialize,
analyze=analyze,
capital_base=intial_portfolio,
data_frequency = 'daily',
bundle='norgatedata-sp500' )

Related

cumsum() on running streak of data

i'm attempting to determine the sum of a column for a running period where there is a negative gain - ie i want to determine the total for a loosing streak.
i've set up a column that provides the numerical number of days where its been loosing (Consecutive Losses), but i wish to sum up the same for the total loss throughout the streak. what i have (Aggregate Consecutive Loss) 1) doesn't work (because it just cumsums() without resetting to zero at each streak) and 2) is incorrectly as i should in fact take the Open value at the start of the streak and Close value at the end.
how can i correctly setup this Aggregate Consecutive Loss value in pandas?
import pandas as pd
import numpy as np
import yfinance as yf
def get( symbols, group_by_ticker=False, **kwargs ):
if not isinstance(symbols, list):
symbols = [ symbols, ]
kwargs['auto_adjust'] = True
kwargs['prepost'] = True
kwargs['threads'] = True
df = None
if group_by_ticker:
kwargs['group_by'] = 'ticker'
df = yf.download( symbols, **kwargs)
for t in symbols:
df["Change Percent",t] = df["Close",t].pct_change() * 100
df["Gain",t] = np.where( df['Change Percent',t] > 0, True, False ).astype('bool')
a = df['Gain',t] != True
df['Consecutive Losses',t] = a.cumsum() - a.cumsum().where(~a).ffill().fillna(0).astype(int)
x = df['Change Percent',t].where( df['Consecutive Losses',t] > 0 )
df['Aggregate Consecutive Loss',t] = x.cumsum() - x.cumsum().where(~a).ffill().fillna(0).astype(float)
return df
data = get( ["DOW", "IDX"], period="6mo")
data[['Change Percent','Gain','Consecutive Losses','Aggregate Consecutive Loss']].head(50)

Optimization Python

I am trying to get the optimal solution
column heading: D_name , Vial_size1 ,Vial_size2 ,Vial_size3 , cost , units_needed
row 1: Act , 120 , 400 , 0 , $5 , 738
row 2: dug , 80 , 200 , 400 , $40 , 262
data in excel
column heading: Vials price size
Row 1: Vial size 1 5 120
Row 2: Vial size 2 5 400
prob=LpProblem("Dose_Vial",LpMinimize)
import pandas as pd
df = pd.read_excel (r'C:\Users\*****\Desktop\Vial.xls')
print (df)
# Create a list of the Vial_Size
Vial_Size = list(df['Vials'])
# Create a dictinary of units for all Vial_Size
size = dict(zip(Vial_Size,df['size']))
# Create a dictinary of price for all Vial_Size
Price = dict(zip(Vial_Size,df['Price']))
# print dictionaries
print(Vial_Size)
print(size)
print(Price)
vial_vars = LpVariable.dicts("Vials",size,lowBound=0,cat='Integer')
# start building the LP problem by adding the main objective function
prob += lpSum([Price[i]*vial_vars[i]*size[i] for i in size])
# adding constraints
prob += lpSum([size[f] * vial_vars[f] for f in size]) >= 738
# The status of the solution is printed to the screen
prob.solve()
print("Status:", LpStatus[prob.status])
# In case the problem is ill-formulated or there is not sufficient information,
# the solution may be infeasible or unbounded
for v in prob.variables():
if v.varValue>0:
print(v.name, "=", format(round(v.varValue)))
Vials_Vial_Size_1 = 3
Vials_Vial_Size_2 = 1
obj =round((value(prob.objective)))
print("The total cost of optimized vials: ${}".format(round(obj)))
The total cost of optimized vials: $3800
'
how to set it for 2 or more drugs and get the best optimal solution.
Here is an approach to solve the first part of the question, finding vial combinations that minimizes the waste (I'm not sure what role the price plays?):
from pulp import *
import pandas as pd
import csv
drugs_dict = {"D_name": ['Act', 'dug'],
"Vial_size1": [120, 80],
"Vial_size2": [400, 200],
"Vial_size3": [0, 400],
"cost": [5, 40],
"units_needed": [738, 262]}
df = pd.DataFrame(drugs_dict)
drugs = list(df['D_name'])
vial_1_size = dict(zip(drugs, drugs_dict["Vial_size1"]))
vial_2_size = dict(zip(drugs, drugs_dict["Vial_size2"]))
vial_3_size = dict(zip(drugs, drugs_dict["Vial_size3"]))
units_needed = dict(zip(drugs, drugs_dict["units_needed"]))
results = []
for drug in drugs:
print(f"drug = {drug}")
# setup minimum waste problem
prob = LpProblem("Minimum Waste Problem", LpMinimize)
# create decision variables
vial_1_var = LpVariable("Vial_1", lowBound=0, cat='Integer')
vial_2_var = LpVariable("Vial_2", lowBound=0, cat='Integer')
vial_3_var = LpVariable("Vial_3", lowBound=0, cat='Integer')
units = lpSum([vial_1_size[drug] * vial_1_var +
vial_2_size[drug] * vial_2_var +
vial_3_size[drug] * vial_3_var])
# objective function
prob += units
# constraints
prob += units >= units_needed[drug]
prob.solve()
print(f"units = {units.value()}")
for v in prob.variables():
if v.varValue > 0:
print(v.name, "=", v.varValue)
results.append([drug, units.value(), int(vial_1_var.value() or 0), int(vial_2_var.value() or 0), int(vial_3_var.value() or 0)])
with open('vial_results.csv', 'w', newline='') as csvfile:
csv_writer = csv.writer(csvfile)
csv_writer.writerow(['drug', 'units', 'vial_1', 'vial_2', 'vial_3'])
csv_writer.writerows(results)
Running gives:
drug = Act
units = 760.0
Vial_1 = 3.0
Vial_2 = 1.0
drug = dug
units = 280.0
Vial_1 = 1.0
Vial_2 = 1.0

Pandas accumulate data for linear regression

I try to adjust my data so total_gross per day is accumulated. E.g.
`Created` `total_gross` `total_gross_accumulated`
Day 1 100 100
Day 2 100 200
Day 3 100 300
Day 4 100 400
Any idea, how I have to change my code to have total_gross_accumulated available?
Here is my data.
my code:
from sklearn import linear_model
def load_event_data():
df = pd.read_csv('sample-data.csv', usecols=['created', 'total_gross'])
df['created'] = pd.to_datetime(df.created)
return df.set_index('created').resample('D').sum().fillna(0)
event_data = load_event_data()
X = event_data.index
y = event_data.total_gross
plt.xticks(rotation=90)
plt.plot(X, y)
plt.show()
List comprehension is the most pythonic way to do this.
SHORT answer:
This should give you the new column that you want:
n = event_data.shape[0]
# skip line 0 and start by accumulating from 1 until the end
total_gross_accumulated =[event_data['total_gross'][:i].sum() for i in range(1,n+1)]
# add the new variable in the initial pandas dataframe
event_data['total_gross_accumulated'] = total_gross_accumulated
OR faster
event_data['total_gross_accumulated'] = event_data['total_gross'].cumsum()
LONG answer:
Full code using your data:
import pandas as pd
def load_event_data():
df = pd.read_csv('sample-data.csv', usecols=['created', 'total_gross'])
df['created'] = pd.to_datetime(df.created)
return df.set_index('created').resample('D').sum().fillna(0)
event_data = load_event_data()
n = event_data.shape[0]
# skip line 0 and start by accumulating from 1 until the end
total_gross_accumulated =[event_data['total_gross'][:i].sum() for i in range(1,n+1)]
# add the new variable in the initial pandas dataframe
event_data['total_gross_accumulated'] = total_gross_accumulated
Results:
event_data.head(6)
# total_gross total_gross_accumulated
#created
#2019-03-01 3481810 3481810
#2019-03-02 4690 3486500
#2019-03-03 0 3486500
#2019-03-04 0 3486500
#2019-03-05 0 3486500
#2019-03-06 0 3486500
X = event_data.index
y = event_data.total_gross_accumulated
plt.xticks(rotation=90)
plt.plot(X, y)
plt.show()

Time Difference between Time Period and Instant

I have some time periods (df_A) and some time instants (df_B):
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta
# Data
df_A = pd.DataFrame({'A1': [dt.datetime(2017,1,5,9,8), dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,1,7,9,19), dt.datetime(2017,2,7,9,19), dt.datetime(2017,2,7,9,19)],
'A2': [dt.datetime(2017,1,5,9,9), dt.datetime(2017,1,5,9,12), dt.datetime(2017,1,7,9,26), dt.datetime(2017,1,7,9,20), dt.datetime(2017,1,7,9,21), dt.datetime(2017,2,7,9,23), dt.datetime(2017,2,7,9,25)]})
df_B = pd.DataFrame({ 'B': [dt.datetime(2017,1,6,14,45), dt.datetime(2017,1,4,3,31), dt.datetime(2017,1,7,3,31), dt.datetime(2017,1,7,14,57), dt.datetime(2017,1,9,14,57)]})
I can match these together:
# Define an Extra Margin
M = dt.timedelta(days = 10)
df_A["A1X"] = df_A["A1"] + M
df_A["A2X"] = df_A["A2"] - M
# Match
Bv = df_B .B .values
A1 = df_A .A1X.values
A2 = df_A .A2X.values
i, j = np.where((Bv[:, None] >= A1) & (Bv[:, None] <= A2))
df_C = pd.DataFrame(np.column_stack([df_B .values[i], df_A .values[j]]),
columns = df_B .columns .append (df_A.columns))
I would like to find the time difference between each time period and the time instant matched to it. I mean that
if B is between A1 and A2
then dT = 0
I've tried doing it like this:
# Calculate dt
def time(A1,A2,B):
if df_C["B"] < df_C["A1"]:
return df_C["A1"].subtract(df_C["B"])
elif df_C["B"] > df_C["A2"]:
return df_C["B"].subtract(df_C["A2"])
else:
return 0
df_C['dt'] = df_C.apply(time)
I'm getting "ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series"
So, I found two fixes:
You are adding M to the lower value and subtracting from the higher one. Change it to:
df_A['A1X'] = df_A['A1'] - M
df_A['A2X'] = df_A['A2'] + M
You are only passing one row of your dataframe at a time to your time function, so it should be something like:
def time(row):
if row['B'] < row['A1']:
return row['A1'] - row['B']
elif row['B'] > row['A2']:
return row['B'] - row['A2']
else:
return 0
And then you can call it like this:
df_C['dt'] = df_C.apply(time, axis=1) :)

dask how to define a custom (time fold) function that operates in parallel and returns a dataframe with a different shape

I am trying to implement a time fold function to be 'map'ed to various partitions of a dask dataframe which in turn changes the shape of the dataframe in question (or alternatively produces a new dataframe with the altered shape). This is how far I have gotten. The result 'res' returned on compute is a list of 3 delayed objects. When I try to compute each of them in a loop (last tow lines of code) this results in a "TypeError: 'DataFrame' object is not callable" After going through the examples for map_partitions, I also tried altering the input DF (inplace) in the function with no return value which causes a similar TypeError with NoneType. What am I missing?
Also, looking at the visualization (attached) I feel like there is a need for reducing the individually computed (folded) partitions into a single DF. How do I do this?
#! /usr/bin/env python
# Start dask scheduler and workers
# dask-scheduler &
# dask-worker --nthreads 1 --nprocs 6 --memory-limit 3GB localhost:8786 --local-directory /dev/shm &
from dask.distributed import Client
from dask.delayed import delayed
import pandas as pd
import numpy as np
import dask.dataframe as dd
import math
foldbucketsecs=30
periodicitysecs=15
secsinday=24 * 60 * 60
chunksizesecs=60 # 1 minute
numts = 5
start = 1525132800 # 01/05
end = 1525132800 + (3 * 60) # 3 minute
c = Client('127.0.0.1:8786')
def fold(df, start, bucket):
return df
def reduce_folds(df):
return df
def load(epoch):
idx = []
for ts in range(0, chunksizesecs, periodicitysecs):
idx.append(epoch + ts)
d = np.random.rand(chunksizesecs/periodicitysecs, numts)
ts = []
for i in range(0, numts):
tsname = "ts_%s" % (i)
ts.append(tsname)
gts.append(tsname)
res = pd.DataFrame(index=idx, data=d, columns=ts, dtype=np.float64)
res.index = pd.to_datetime(arg=res.index, unit='s')
return res
gts = []
load(start)
cols = len(gts)
idx1 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+periodicitysecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx1[:0], data=[], columns=gts, dtype=np.float64)
dfs = [delayed(load)(fn) for fn in range(start, end, chunksizesecs)]
from_delayed = dd.from_delayed(dfs, meta, 'sorted')
nfolds = int(math.ceil((end - start)/foldbucketsecs))
cprime = nfolds * cols
gtsnew = []
for i in range(0, cprime):
gtsnew.append("ts_%s,fold=%s" % (i%cols, i/cols))
idx2 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+foldbucketsecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx2[:0], data=[], columns=gtsnew, dtype=np.float64)
folded_df = from_delayed.map_partitions(delayed(fold)(from_delayed, start, foldbucketsecs), meta=meta)
result = c.submit(reduce_folds, folded_df)
c.gather(result).visualize(filename='/usr/share/nginx/html/svg/df4.svg')
res = c.gather(result).compute()
for f in res:
f.compute()
Never mind! It was my fault, instead of wrapping my function in delayed I simply passed it to the map_partitions call like so and it worked.
folded_df = from_delayed.map_partitions(fold, start, foldbucketsecs, nfolds, meta=meta)