zipline error KeyError: <type 'zipline.assets._assets.Equity'> - pandas

When I try to execute a simple crossover strategy algorithm outside quantopian framework using zipline, I get the following error.
KeyError: <type 'zipline.assets._assets.Equity'>
This is a simple crossover strategy where 50-100 day moving averages are calculated to derive trading strategy. I am unable to run this strategy out of Quantopian framework using zipline.
Code is as follows
import pandas as pd
import zipline
from zipline import TradingAlgorithm
from zipline.api import order, sid
from zipline.utils.factory import load_from_yahoo
import matplotlib.pyplot as plt
from zipline.api import order, symbol, record, order_target
import pytz
%matplotlib inline
# creating time interval
start = pd.Timestamp('2013-01-25', tz='UTC')
end = pd.Timestamp('2017-02-01', tz='UTC')
#input_date = get_pricing(['AAPL'],start,end,frequency='daily')
# loading the data
#input_data = load_bars_from_yahoo(stocks=['AAPL'], start=start,end=end,)
data = load_from_yahoo(stocks=['AAPL'], indexes={}, start=start, end=end)
data = data.dropna()
def initialize(context):
context.security= symbol('AAPL')
context.i =0
def handle_data(context, data):
context.i += 1
if context.i<100:
return
MA1 = data[context.security].mavg(50)
MA2 = data[context.security].mavg(100)
date = str(data[context.security].datetime)[:10]
current_price = data[context.security].price
current_positions = context.portfolio.positions[symbol('AAPL')].amount
cash = context.portfolio.cash
value = context.portfolio.portfolio_value
current_pnl = context.portfolio.pnl
if (MA1 > MA2) and current_positions == 0:
number_of_shares = 100
order(context.security, number_of_shares)
record(AAPL=inputdata[symbol('AAPL')].price,date=date,MA1 = MA1, MA2 = MA2, Price=
current_price,status="buy",shares=number_of_shares,PnL=current_pnl,cash=cash,value=value)
elif (MA1 < MA2) and current_positions != 0:
order_target(context.security, 0)
record(AAPL=inputdata[symbol('AAPL')].price,date=date,MA1 = MA1, MA2 = MA2, Price= current_price,status="sell",shares="--",PnL=current_pnl,cash=cash,value=value)
else:
record(AAPL=inputdata[symbol('AAPL')].price,date=date,MA1 = MA1, MA2 = MA2, Price= current_price,status="--",shares="--",PnL=current_pnl,cash=cash,value=value)
algo = TradingAlgorithm(initialize=initialize, handle_data=handle_data)
results = algo.run(input_data)

use codes as below to calculate MA1 and MA2, then it works!
because some of the function is out of date in zipline 1.1.0
from talib import MA
trailing_window = data.history(assets=context.security, fields='price', bar_count=100, frequency='1d')
MA1 = MA(trailing_window.values, 50)[-1]
MA2 = MA(trailing_window.values, 100)[-1]
or use the codes as below without using talib:
trailing_window1 = data.history(assets=context.security, fields='price', bar_count=50, frequency='1d')
trailing_window2 = data.history(assets=context.security, fields='price', bar_count=100, frequency='1d')
MA1 = trailing_window1.mean()
MA2 = trailing_window2.mean()

Related

ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records')

I'm running this flask app
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd
# Create the app object
app = Flask(__name__)
cors = CORS(app, resources= {r"/*": {'origins' : "*"}})
# importing function for calculations
from Record_Matching import Matching
#app.route("/query", methods = ['get'])
#cross_origin()
def query():
# service_account_creds = request.json
query1 = request.args.get('query1', type = str)
query2 = request.args.get('query2', type = str)
querycolumns = request.args.get('querycolumns')
project_id = request.args.get('project_id', type = str)
service_account_creds = request.args.get('service_account')
SS = request.args.get('SS', type = float)
TT = request.args.get('TT', type = float)
result = Matching(query1,query2, SS,TT, service_account_creds, project_id, querycolumns)
return result
if __name__ == "__main__":
app.run(host="localhost", port=8080, debug=True)
and I'm importing the matching function from this python scripts
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast
# Results to data frame function
def gcp2df(sql, client):
query = client.query(sql)
results = query.result()
return results.to_dataframe()
# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
# client = bigquery.Client()
# job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
# return client.load_table_from_dataframe(df, table, job_config = job_config)
def pair(df1, df2, TT, querycolumns):
# function to take pair from list and compare:
L = querycolumns
l=len(querycolumns)
p1=0
p2=1
# To generate phonetics we need to make sure all names are in english.
# thus we'll replace non-english words by random english strings
df1[L[p1]] = df1[L[p1]].astype(str)
df2[L[p2]] = df2[L[p2]].astype(str)
for i in range(0,len(df1)):
if df1[L[p1]][i].isascii() == False:
df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
for i in range(0,len(df2)):
if df2[L[p2]][i].isascii() == False:
df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
compare = recordlinkage.Compare()
df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
indexer = recordlinkage.Index()
indexer.block('initials')
candidate_links = indexer.index(df1, df2)
compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
# O(n) a function that uses two pointers to track consecutive pairs for the input list
while p2 <=l:
compare.string(L[p1], L[p2], method='jarowinkler',threshold = TT, label=L[p1])
p1+=2
p2+=2
features = compare.compute(candidate_links,df1, df2)
return features
def Matching(query1,query2, SS,TT, service_account_creds, project_id, querycolumns):
service_account_creds = ast.literal_eval(service_account_creds)
credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
service_account_creds['token_uri'])
job_config = bigquery.LoadJobConfig()
client = bigquery.Client( project = project_id)
SS=int(SS)
TT=float(TT)
df1 = gcp2df("""{}""".format(query1), client)
df2 = gcp2df("""{}""".format(query2), client)
querycolumns = json.loads(querycolumns)
querycolumns = list(querycolumns.values())
features = pair(df1, df2, TT, querycolumns)
features['Similarity_score'] = features.sum(axis=1)
features = features[features['Similarity_score']>=SS].reset_index()
final = features[['level_0', 'level_1']]
final.rename(columns= {'level_0':'df1_index', 'level_1':'df2_index'}, inplace= True)
final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
final['Unique_ID'] = final['Unique_ID'].astype(str)
final['Similarity_Score'] = SS
final_duplicates = final['df1_index'].value_counts().max()
# insert(final,"test-ahmed-project.Record_Linkage.Matching_Indices")
message = "Mission accomplished!, your highest number of duplicates is " + str(final_duplicates)
return {'message':message,'final':final.to_dict('records'), 'df1':df1.to_dict('records')}
I'm not sure why when I return df1 as a dictionary it shows ValueError error when I try to to use the function from flask app, but when I run it in a jupytor notebook using the same dataframe that I'm taking from bigquery, it works just fine, so why does it not work on the flask app?
I tried to_dict('record') to convert a dataframe to a dictionary,
it looking online many resources suggest the error exists because the data contains missing values, but it shouldn't be a problem because when I try converting the same dataframe to dictionary in jupyter notebook it works just fine.

can i use OR-tools for TSP with partial distance matrix (for a huge set of nodes)?

i'm trying to solve tsp with OR-tools for a problem of something like 80,000 nodes, the problem is, I need a huge distance matrix that takes to much memory ,so its infeasible and i don't get a solution.
so:
is there an option to work with partial distance matrix in or-tools?
if not is there a way to improve my code?
is there another external solver that can work for this task in python?
import math
from collections import namedtuple
import random
import time
from collections import namedtuple
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
import numba
from scipy.spatial import distance_matrix
from sklearn.metrics.pairwise import euclidean_distances
from math import sqrt
Point = namedtuple("Point", ['x', 'y'])
def solve_it(input_data):
# Modify this code to run your optimization algorithm
global POINTS
# parse the input
lines = input_data.split('\n')
nodeCount = int(lines[0])
points = []
for i in range(1, nodeCount+1):
line = lines[i]
parts = line.split()
points.append(Point(float(parts[0]), float(parts[1])))
#2.routing with or tools
def dist_matrix(nodeCount,points):
data=[]
for k in range(len(points)):
data.append([int(points[k].x),int(points[k].y)])
D=euclidean_distances(data, data)
return D
def create_data_model(D):
"""Stores the data for the problem."""
data = {}
data['distance_matrix'] = D # yapf: disable
data['num_vehicles'] = 1
data['depot'] = 0
return data
def print_solution(manager, routing, solution):
index = routing.Start(0)
plan_output = []#Route for vehicle 0:\n'
route_distance = 0
while not routing.IsEnd(index):
plan_output.append(manager.IndexToNode(index))
index = solution.Value(routing.NextVar(index))
return plan_output
def or_main(nodeCount,points):
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
"""Entry point of the program."""
# Instantiate the data problem.
global sol
D=dist_matrix(nodeCount,points)
data = create_data_model(D)
# Create the routing index manager.
manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
data['num_vehicles'], data['depot'])
# Create Routing Model.
routing = pywrapcp.RoutingModel(manager)
def distance_callback(from_index, to_index):
"""Returns the distance between the two nodes."""
# Convert from routing variable Index to distance matrix NodeIndex.
from_node = manager.IndexToNode(from_index)
to_node = manager.IndexToNode(to_index)
return data['distance_matrix'][from_node][to_node]
transit_callback_index = routing.RegisterTransitCallback(distance_callback)
# Define cost of each arc.
routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
# Setting first solution heuristic.
search_parameters = pywrapcp.DefaultRoutingSearchParameters()
search_parameters.local_search_metaheuristic = (
routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
k = 100
if nodeCount <= 100:
k = 30
elif 100 <= nodeCount <= 1000:
k = 300
elif nodeCount > 1000:
k = 17000
search_parameters.time_limit.seconds =k
search_parameters.log_search = True
# Solve the problem.
solution = routing.SolveWithParameters(search_parameters)
# #print solution on console.
if solution:
sol=print_solution(manager, routing, solution)
return sol
######################################################################
solution=or_main(nodeCount,points)
# calculate the length of the tour
obj = length(points[solution[-1]], points[solution[0]])
for index in range(0, nodeCount-1):
obj += length(points[solution[index]], points[solution[index+1]])
# prepare the solution in the specified output format
output_data = '%.2f' % obj + ' ' + str(0) + '\n'
output_data += ' '.join(map(str, solution))
return output_data
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
file_location = sys.argv[1].strip()
with open(file_location, 'r') as input_data_file:
input_data = input_data_file.read()
#print(solve_it(input_data))
else:
print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/tsp_51_1)')

KeyError' in Python

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
dt = pd.read_csv("C:\Subhro\ML_Internship\MARUTI_2.csv")
data = pd.DataFrame(dt)
data = data.drop('Date',axis=1)
data.drop(['Unnamed: 0'],axis=1,inplace=True)
print(data)
Roll_Mean_14 = data['Close Price'].rolling(window=14).mean()
Standard_Dev_14 = data['Close Price'].rolling(window=14).mean().std()
Upper_Band_14 = data['Close Price'].rolling(window=14).mean() + (2*Standard_Dev_14)
Low_Band_14 = data['Close Price'].rolling(window=14).mean() - (2*Standard_Dev_14)
avg_stock_price = data['Average Price']
stock_price = data['Close Price']
data['Roll_Avg'] = Roll_Mean_14
data['Upper_Band'] = Upper_Band_14
data['Lower_Band'] = Low_Band_14
data['Avg_Stock_Price'] = avg_stock_price
data=data.drop(data.head(14).index, inplace=False)
print(data)
for i in (data):
if((data['Close Price'][i])<(data['Lower_Band'][i])):
data['Call'][i]='Buy'
elif((data['Close Price'][i])>(data['Lower Band'][i])) and ((data['Close Price'][i])<(data['Roll_Avg'])):
data['Call'][i]='Hold Buy/Liquidate Short'
elif((data['Close Price'][i])>(data['Roll_Avg'][i])) and ((data['Close Price'][i])<(data['Upper Band'])):
data['Call'][i]='Hold Short/Liquidate Buy'
elif((data['Close Price'][i])>(data['Upper_Band'])):
data['Call'][i]='Short'
print(data)
In this code, I have been creating a new column : 'Call' to print the categories 'Buy','Short','Hold Buy/Liquidate Short', 'Hold Short/Liquidate Buy' according to the conditions given in the code. On running the code it is showing me the error as
KeyError : 'Symbol' in line
if((data['Close Price'][i])<(data['Lower_Band'][i])):
Your manner of accessing the indexes of the dataframe is incorrect.
You could try this :
for i in data.index:
if((data[i]['Close Price'])<(data[i]['Lower_Band'])):
The way you access a particular value(cell) in a dataframe(table) is :
data[row_index][column_index]

could not convert string to float in python

i try to analysis the Principle Component from cvs file but when i run the code i get this error
C:\Users\Lenovo\Desktop>python pca.py
ValueError: could not convert string to float: Annee;NET;INT;SUB;LMT;DCT;IMM;EXP;VRD
this is my cvs file
i try to remove any space and any think
this is my python script, i don't know what i miss
Note: i run this code under python2.7
from sklearn.externals import joblib
import numpy as np
import glob
import os
import time
import numpy
my_matrix = numpy.loadtxt(open("pca.csv","rb"),delimiter= ",",skiprows=0)
def pca(dataMat, r, autoset_r=False, autoset_rate=0.9):
"""
purpose: principal components analysis
"""
print("Start to do PCA...")
t1 = time.time()
meanVal = np.mean(dataMat, axis=0)
meanRemoved = dataMat - meanVal
# normData = meanRemoved / np.std(dataMat)
covMat = np.cov(meanRemoved, rowvar=0)
eigVals, eigVects = np.linalg.eig(np.mat(covMat))
eigValIndex = np.argsort(-eigVals)
if autoset_r:
r = autoset_eigNum(eigVals, autoset_rate)
print("autoset: take top {} of {} features".format(r, meanRemoved.shape[1]))
r_eigValIndex = eigValIndex[:r]
r_eigVect = eigVects[:, r_eigValIndex]
lowDDataMat = meanRemoved * r_eigVect
reconMat = (lowDDataMat * r_eigVect.T) + meanVal
t2 = time.time()
print("PCA takes %f seconds" %(t2-t1))
joblib.dump(r_eigVect, './pca_args_save/r_eigVect.eig')
joblib.dump(meanVal, './pca_args_save/meanVal.mean')
return lowDDataMat, reconMat
def autoset_eigNum(eigValues, rate=0.99):
eigValues_sorted = sorted(eigValues, reverse=True)
eigVals_total = eigValues.sum()
for i in range(1, len(eigValues_sorted)+1):
eigVals_sum = sum(eigValues_sorted[:i])
if eigVals_sum / eigVals_total >= rate:
break
return i
It seemed that NumPy has some problem parsing your index row to float.
Try setting skiprows = 1 in your np.readtxt command in order to skip the table header.

Bokeh: Bad date format?

would anyone advise me how to adjust the X axis to better display the date on this graph?
from math import pi
import pandas as pd
from bokeh.io import show
from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
#cesta k souboru
path = "C://Users//Zemi4//Desktop//zpr3//all2.csv"
#nacteni dataframu
data = pd.read_csv(path, delimiter = ",")
data['Cas'] = data['Cas'].astype(str)
data = data.set_index('Cas')
data.columns.name = 'Mistnost'
times = list(data.index)
rooms = list(data.columns)
df = pd.DataFrame(data.stack(), columns=['float']).reset_index()
colors = ['#440154', '#404387', '#29788E', '#22A784', '#79D151', '#FDE724', '#FCFEA4', '#FBA40A', '#DC5039']
mapper = LinearColorMapper(palette=colors, low=df.float.min(), high=df.float.max())
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(title="Heatmap ({0} - {1})".format(times[0], times[-1]),
x_range=times, y_range=list(reversed(rooms)),
x_axis_location="above", plot_width=1500, plot_height=900,
tools=TOOLS, toolbar_location='below',
tooltips=[('Time: ', '#Cas'), ('Temperature: ', '#float'), ('Room: ', '#Mistnost')],
x_axis_type='datetime')
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3
p.rect(x="Cas", y="Mistnost", width=1, height=1,
source=df,
fill_color={'field': 'float', 'transform': mapper},
line_color=None)
color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
ticker=BasicTicker(desired_num_ticks=len(colors)),
formatter=PrintfTickFormatter(format="%f"),
label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')
show(p) # show the pl
Try: p.xaxis[0].ticker.desired_num_ticks = <number_ticks_you_want_to_display>.
Or apply a specific ticker (see Bokeh docs) like you did for the ColorBar.