KeyError in Python - pandas

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
dt = pd.read_csv("C:\Subhro\ML_Internship\MARUTI_2.csv")
data = pd.DataFrame(dt)
data = data.drop('Date',axis=1)
data.drop(['Unnamed: 0'],axis=1,inplace=True)
print(data)
Roll_Mean_14 = data['Close Price'].rolling(window=14).mean()
Standard_Dev_14 = data['Close Price'].rolling(window=14).mean().std()
Upper_Band_14 = data['Close Price'].rolling(window=14).mean() + (2*Standard_Dev_14)
Low_Band_14 = data['Close Price'].rolling(window=14).mean() - (2*Standard_Dev_14)
avg_stock_price = data['Average Price']
stock_price = data['Close Price']
data['Roll_Avg'] = Roll_Mean_14
data['Upper_Band'] = Upper_Band_14
data['Lower_Band'] = Low_Band_14
data['Avg_Stock_Price'] = avg_stock_price
data=data.drop(data.head(14).index, inplace=False)
print(data)
for i in (data):
    if((data['Close Price'][i])<(data['Lower_Band'][i])):
        data['Call'][i]='Buy'
    elif((data['Close Price'][i])>(data['Lower Band'][i])) and ((data['Close Price'][i])<(data['Roll_Avg'])):
        data['Call'][i]='Hold Buy/Liquidate Short'
    elif((data['Close Price'][i])>(data['Roll_Avg'][i])) and ((data['Close Price'][i])<(data['Upper Band'])):
        data['Call'][i]='Hold Short/Liquidate Buy'
    elif((data['Close Price'][i])>(data['Upper_Band'])):
        data['Call'][i]='Short'
print(data)
In this code, I am creating a new column, 'Call', to hold the categories 'Buy', 'Short', 'Hold Buy/Liquidate Short' and 'Hold Short/Liquidate Buy' according to the conditions given in the code. On running the code, it shows the error
KeyError: 'Symbol' in the line
if((data['Close Price'][i])<(data['Lower_Band'][i])):

Your manner of accessing the indexes of the dataframe is incorrect. Iterating over a DataFrame with for i in data yields its column names (one of which is 'Symbol' here), not its row labels, which is why the lookup raises KeyError: 'Symbol'. Iterate over the index instead:
for i in data.index:
    if data.loc[i, 'Close Price'] < data.loc[i, 'Lower_Band']:
The way you access a particular value (cell) in a dataframe (table) is:
data.loc[row_index, column_name]
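For completeness, the row loop can be avoided entirely with a vectorized approach. Below is a minimal sketch using numpy.select, assuming the column names from the question; the conditions are checked in order, so the default covers the remaining 'Short' case:
import numpy as np
# np.select picks the first matching label per row; rows matching none get the default.
conditions = [
    data['Close Price'] < data['Lower_Band'],
    data['Close Price'] < data['Roll_Avg'],
    data['Close Price'] < data['Upper_Band'],
]
choices = ['Buy', 'Hold Buy/Liquidate Short', 'Hold Short/Liquidate Buy']
data['Call'] = np.select(conditions, choices, default='Short')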


ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records')

I'm running this Flask app:
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd

# Create the app object
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {'origins': "*"}})

# importing function for calculations
from Record_Matching import Matching

@app.route("/query", methods=['get'])
@cross_origin()
def query():
    # service_account_creds = request.json
    query1 = request.args.get('query1', type=str)
    query2 = request.args.get('query2', type=str)
    querycolumns = request.args.get('querycolumns')
    project_id = request.args.get('project_id', type=str)
    service_account_creds = request.args.get('service_account')
    SS = request.args.get('SS', type=float)
    TT = request.args.get('TT', type=float)
    result = Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns)
    return result

if __name__ == "__main__":
    app.run(host="localhost", port=8080, debug=True)
and I'm importing the Matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast

# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()

# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
#     client = bigquery.Client()
#     job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
#     return client.load_table_from_dataframe(df, table, job_config=job_config)

def pair(df1, df2, TT, querycolumns):
    # function to take pairs from the list and compare them:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in English,
    # so we replace non-English words with random English strings
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n): uses two pointers to walk consecutive column pairs from the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features

def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
                                              service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished! Your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}
I'm not sure why returning df1 as a dictionary raises the ValueError when I use the function from the Flask app, but when I run it in a Jupyter notebook using the same dataframe that I'm taking from BigQuery, it works just fine. Why does it not work in the Flask app?
I tried to_dict('records') to convert the dataframe to a dictionary. Looking online, many resources suggest the error exists because the data contains missing values, but that shouldn't be the problem here, because converting the same dataframe to a dictionary in a Jupyter notebook works just fine.
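A commonly suggested workaround for this error is to neutralize the missing-value sentinels before serializing. A minimal sketch, assuming df1 is the frame returned from BigQuery (nothing in this thread confirms it resolves this exact case):
# Cast to object so None survives, then swap NaT/NaN for plain None,
# which to_dict('records') and JSON encoders handle without complaint.
df1 = df1.astype(object).where(df1.notnull(), None)
records = df1.to_dict('records')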

Pandas DataFrame - How to extract string patterns with hidden characters

I am scraping names, prices and images from this website. There are 8 items in total, but in the DataFrame I would like to keep only the items that contain the pattern "Original Zaino Antifurto". When I try to apply the bp_filter to the DataFrame I get an error, probably due to hidden characters.
Does anyone know how to filter for this pattern while avoiding the error?
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_='product details product-item-details')

names_xd = []
prices_xd = []
picts_xd = []
for container in con_xd:
    name = container.find("a", class_="product-item-link").text
    names_xd.append(name)
for container in con_xd:
    price = container.find("span", class_="price").text
    prices_xd.append(price)
for container in con_xd:
    pict = container.find("a").get("href")
    picts_xd.append(pict)

bp_xd = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                      'Item_Price_EUR': prices_xd,
                      'Link_to_Pict': picts_xd})
bp_xd['Item_Price_EUR'] = bp_xd['Item_Price_EUR'].str.replace('€', '').str.replace(',', '.').astype(float)
bp_xd['(XD-Design) Item_Name'] = bp_xd['(XD-Design) Item_Name'].str.strip()
bp_filter = bp_xd['(XD-Design) Item_Name'][bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto')]
# bp_xd[bp_filter]
Here is the fixed working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_='product details product-item-details')

names_xd = [c.find("a", class_="product-item-link").text for c in con_xd]
prices_xd = [c.find("span", class_="price").text for c in con_xd]
picts_xd = [c.find("a").get("href") for c in con_xd]

df = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                   'Item_Price_EUR': prices_xd,
                   'Link_to_Pict': picts_xd})
df['Item_Price_EUR'] = df['Item_Price_EUR'].str.replace('€', '').str.replace(',', '.').astype(float)
df['(XD-Design) Item_Name'] = df['(XD-Design) Item_Name'].str.strip()
df = df.loc[df['(XD-Design) Item_Name'].apply(lambda x: 1 if 'Original Zaino Antifurto' in x else 0) == 1]
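If the hidden characters are non-breaking spaces (a plausible culprit when scraping names and prices), an alternative sketch is to normalize them and filter with a boolean mask; note that the original bp_xd[bp_filter] fails in any case because bp_filter is a Series of name strings, not booleans:
# Replace non-breaking spaces, then build a boolean mask for row filtering.
mask = (df['(XD-Design) Item_Name']
        .str.replace('\xa0', ' ', regex=False)
        .str.contains('Original Zaino Antifurto', regex=False))
df = df[mask]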

Bokeh: Bad date format?

Would anyone advise me how to adjust the x-axis to better display the dates on this graph?
from math import pi
import pandas as pd
from bokeh.io import show
from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure

# path to the file
path = "C://Users//Zemi4//Desktop//zpr3//all2.csv"
# load the dataframe
data = pd.read_csv(path, delimiter=",")
data['Cas'] = data['Cas'].astype(str)
data = data.set_index('Cas')
data.columns.name = 'Mistnost'
times = list(data.index)
rooms = list(data.columns)
df = pd.DataFrame(data.stack(), columns=['float']).reset_index()
colors = ['#440154', '#404387', '#29788E', '#22A784', '#79D151', '#FDE724', '#FCFEA4', '#FBA40A', '#DC5039']
mapper = LinearColorMapper(palette=colors, low=df.float.min(), high=df.float.max())
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(title="Heatmap ({0} - {1})".format(times[0], times[-1]),
           x_range=times, y_range=list(reversed(rooms)),
           x_axis_location="above", plot_width=1500, plot_height=900,
           tools=TOOLS, toolbar_location='below',
           tooltips=[('Time: ', '@Cas'), ('Temperature: ', '@float'), ('Room: ', '@Mistnost')],
           x_axis_type='datetime')
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3
p.rect(x="Cas", y="Mistnost", width=1, height=1,
       source=df,
       fill_color={'field': 'float', 'transform': mapper},
       line_color=None)
color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%f"),
                     label_standoff=6, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')
show(p)  # show the plot
Try: p.xaxis[0].ticker.desired_num_ticks = <number_ticks_you_want_to_display>.
Or apply a specific ticker (see the Bokeh docs), as you did for the ColorBar.
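For example, a minimal sketch of that second suggestion (10 is an arbitrary tick count, and it assumes the x-axis is a continuous/datetime axis rather than a categorical one):
from bokeh.models import BasicTicker
# Replace the x-axis ticker with one that targets a fixed number of ticks.
p.xaxis[0].ticker = BasicTicker(desired_num_ticks=10)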

zipline error KeyError: <type 'zipline.assets._assets.Equity'>

When I try to execute a simple crossover strategy algorithm outside the Quantopian framework using zipline, I get the following error:
KeyError: <type 'zipline.assets._assets.Equity'>
This is a simple crossover strategy where 50- and 100-day moving averages are calculated to derive the trading strategy. I am unable to run this strategy outside the Quantopian framework using zipline.
The code is as follows:
import pandas as pd
import zipline
from zipline import TradingAlgorithm
from zipline.api import order, sid
from zipline.utils.factory import load_from_yahoo
import matplotlib.pyplot as plt
from zipline.api import order, symbol, record, order_target
import pytz
%matplotlib inline

# creating time interval
start = pd.Timestamp('2013-01-25', tz='UTC')
end = pd.Timestamp('2017-02-01', tz='UTC')
#input_date = get_pricing(['AAPL'],start,end,frequency='daily')
# loading the data
#input_data = load_bars_from_yahoo(stocks=['AAPL'], start=start,end=end,)
data = load_from_yahoo(stocks=['AAPL'], indexes={}, start=start, end=end)
data = data.dropna()

def initialize(context):
    context.security = symbol('AAPL')
    context.i = 0

def handle_data(context, data):
    context.i += 1
    if context.i < 100:
        return
    MA1 = data[context.security].mavg(50)
    MA2 = data[context.security].mavg(100)
    date = str(data[context.security].datetime)[:10]
    current_price = data[context.security].price
    current_positions = context.portfolio.positions[symbol('AAPL')].amount
    cash = context.portfolio.cash
    value = context.portfolio.portfolio_value
    current_pnl = context.portfolio.pnl
    if (MA1 > MA2) and current_positions == 0:
        number_of_shares = 100
        order(context.security, number_of_shares)
        record(AAPL=inputdata[symbol('AAPL')].price, date=date, MA1=MA1, MA2=MA2, Price=current_price,
               status="buy", shares=number_of_shares, PnL=current_pnl, cash=cash, value=value)
    elif (MA1 < MA2) and current_positions != 0:
        order_target(context.security, 0)
        record(AAPL=inputdata[symbol('AAPL')].price, date=date, MA1=MA1, MA2=MA2, Price=current_price,
               status="sell", shares="--", PnL=current_pnl, cash=cash, value=value)
    else:
        record(AAPL=inputdata[symbol('AAPL')].price, date=date, MA1=MA1, MA2=MA2, Price=current_price,
               status="--", shares="--", PnL=current_pnl, cash=cash, value=value)

algo = TradingAlgorithm(initialize=initialize, handle_data=handle_data)
results = algo.run(input_data)
Use the code below to calculate MA1 and MA2; then it works,
because some of those functions are out of date in zipline 1.1.0:
from talib import MA
trailing_window = data.history(assets=context.security, fields='price', bar_count=100, frequency='1d')
MA1 = MA(trailing_window.values, 50)[-1]
MA2 = MA(trailing_window.values, 100)[-1]
Or use the code below without talib:
trailing_window1 = data.history(assets=context.security, fields='price', bar_count=50, frequency='1d')
trailing_window2 = data.history(assets=context.security, fields='price', bar_count=100, frequency='1d')
MA1 = trailing_window1.mean()
MA2 = trailing_window2.mean()
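For context, a minimal sketch of how the second variant slots into handle_data; the 100-bar warm-up check is carried over from the question, and data.current is assumed here as the zipline 1.x way to read the latest price:
def handle_data(context, data):
    context.i += 1
    if context.i < 100:
        return  # wait until 100 bars of history exist
    # data.history replaces the deprecated data[asset].mavg(...) API
    trailing_window1 = data.history(assets=context.security, fields='price',
                                    bar_count=50, frequency='1d')
    trailing_window2 = data.history(assets=context.security, fields='price',
                                    bar_count=100, frequency='1d')
    MA1 = trailing_window1.mean()
    MA2 = trailing_window2.mean()
    current_price = data.current(context.security, 'price')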

Concatenate Data From URLs Recursively Inside One DataFrame

I'm trying to create one dataframe with data from multiple urls I'm scraping. The code works; however, I'm unable to accumulate the data in one DataFrame across iterations. The DataFrame (called frame) is replaced with each new url's data rather than having the new data concatenated onto the same frame. Thank you, I deeply appreciate your help!
import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']
i = 0
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'
pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)

while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                       'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                       'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})
print frame
Use pd.concat:
frames = []
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frames.append(DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                             'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                             'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]}))
pd.concat(frames)
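Note that pd.concat returns a new DataFrame rather than modifying frames in place, so in practice you would keep the result, e.g. df = pd.concat(frames, ignore_index=True) (ignore_index=True is an optional tweak that renumbers the combined rows).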
You are overwriting your frame with each iteration of the loop. As Phillip Cloud suggested, you can make a list of frames and append to it with each loop. I simplified your code differently, but I think this gives you what you want.
import urllib
import re
import pandas as pd

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

regex = {'pattern': re.compile('<p class="commentText">(.+?)</p>'),
         'easiness': re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
         'helpfulness': re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
         'clarity': re.compile('Clarity</strong><span>(.+?)</span></p>'),
         'interest': re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
         'date': re.compile('<div class="date">(.+?)</div>'),
         'mathclass': re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
         'prof_name': re.compile('<meta name="prof_name" content="(.+?)"/>')}

# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
    d[k] = []

# Now fill those lists
for url in urls:
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    for k, v in regex.iteritems():
        d[k].append(re.findall(v, htmltext))

frame = pd.DataFrame(d)  # Dump the dict into a DataFrame
print frame