WebDriverException when trying to scrape Amazon for product title and price using Selenium

I'm attempting to scrape Amazon for iPhone 11 names and prices, but when I run the code I get a WebDriverException.
My code is as follows:
```
#First project
import csv

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


class CrawledInfo:
    def __init__(self, product_name, product_price, cust_name=None, cust_location=None,
                 rating=None, review=None, review_date=None) -> None:
        self.cust_name = cust_name
        self.cust_location = cust_location
        self.product_name = product_name
        self.product_price = product_price
        self.rating = rating
        self.review = review
        self.review_date = review_date


class CrawlerBot:
    def item(self, name):
        count = 1
        page = 1
        pageIncrement = 1
        maxRetrieves = 100
        url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
        l = []

        #Declaring options
        options = Options()
        options.headless = False
        options.add_experimental_option('detach', True)

        browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        browser.maximize_window()
        browser.get(url)
        browser.set_page_load_timeout(10)

        while True:
            try:
                if pageIncrement * page > maxRetrieves:
                    break
                if count > pageIncrement:
                    page += 1
                    count = 1

                #Capture item name
                xPathTitle = '//*[@id="search"]/div[1]/div[2]/div/span[3]/div[2]/div[' + str(count) + ']/div/span/div/div/div[2]/div[2]/div/div[1]/div/div/div[1]/h2/a/span'
                title = browser.find_element_by_xpath(xPathTitle)
                titleText = title.get_attribute('innerHTML').splitlines()[0]
                title.click()

                #Capture item price
                xPathPrice = '//*[@id="price_inside_buybox"]'
                price = browser.find_element_by_xpath(xPathPrice)
                priceText = price.get_attribute('innerHTML').splitlines()

                #Return to the search page
                url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
                browser.get(url)
                browser.set_page_load_timeout(10)

                #Send the results to class CrawledInfo
                info = CrawledInfo(titleText, priceText)
                l.append(info)
                count += 1
            except Exception as e:
                print('Exception: ', e)
                count += 1
                if pageIncrement * page > maxRetrieves:
                    break
                if count > pageIncrement:
                    page += 1
                    count = 1

                #Return to the search page
                url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
                browser.get(url)
                browser.set_page_load_timeout(10)

        browser.close()
        return l


#Creating the object
start_crawler = CrawlerBot()

with open('results', 'w', newline='', encoding='utf-8') as fileWriter:
    dataWriter = csv.writer(fileWriter, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for dat in start_crawler.item('iphone 11'):
        dataWriter.writerow([dat.product_name, dat.product_price])
```
Does anyone have an idea of what's wrong?
When the code works right, I expect it to create a CSV file with the iPhone 11 product names together with their prices.
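Since the exact exception text isn't shown, one hedged guess is that the long index-built XPaths race the page load. Below is a minimal sketch of waiting explicitly for the result cards instead; the `div[data-component-type="s-search-result"]` selector is an assumption about Amazon's current markup, not something taken from the question.
```
# Hedged sketch: wait for the search-result cards to be present before reading them,
# instead of addressing them with a long positional XPath. The CSS selector below is
# an assumption about Amazon's markup, not confirmed by the question.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.amazon.co.uk/s?k=iphone+11&page=1')

# Wait up to 10 s for at least one result title to be present in the DOM.
titles = WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'div[data-component-type="s-search-result"] h2 a span')
    )
)
for t in titles:
    print(t.text)

browser.quit()
```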

Related

Selenium Issue with Exec?

I am running a web scraper with Selenium to get some data on the NBA. I have URLs to get to the websites for each of the 30 teams, but when I run the code it only gets through a few of the URLs and then crashes with the errors shown below:
```
#web scraper
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import pandas as pd
import os


class NBAScraper:
    def __init__(self):
        #part 1
        url = "https://www.nba.com/teams"
        HTML = requests.get(url)
        soup = BeautifulSoup(HTML.text, 'html.parser')
        text = str(soup.find_all("a", "Anchor_anchor__cSc3P TeamFigureLink_teamFigureLink__uqnNO"))

        ids = []
        for i in range(0, 30):
            hr = text.find("stats")
            ids.append(text[(hr+11):(hr+21)])
            text = text[(hr+22):]

        #part 2
        names = []
        for j in range(0, 30):
            url2 = "https://www.nba.com/stats/team/" + str(ids[j]) + "/advanced"
            HTML2 = requests.get(url2)
            soup2 = BeautifulSoup(HTML2.text, 'html.parser')
            ##div class="TeamHeader_name__MmHlP
            name = str(soup2.find("div", "TeamHeader_name__MmHlP"))
            ni = name.find("div>")
            ni2 = name.find("<!")
            name1 = name[(ni+4):ni2]
            name = name[ni2:]
            ni3 = name.find("<div>")
            name = name[(ni3+5):]
            ni4 = name.find("</div>")
            name2 = name[:ni4]
            n = name1 + " " + name2
            names.append(n)

        ##tbody class="Crom_body__UYOcU"
        #part 3
        offrtg = []
        defrtg = []
        reb = []
        tov = []
        efg = []
        for k in range(0, 30):
            self.driver = webdriver.Chrome()
            url3 = "https://www.nba.com/stats/team/" + str(ids[k]) + "/advanced"
            self.driver.get(url3)
            rndrhtml = self.driver.page_source
            self.driver.close()
            #self.driver.quit()

            soup3 = BeautifulSoup(rndrhtml, 'html.parser')
            ovrall = str(soup3.find("tbody", "Crom_body__UYOcU").find_all("td"))
            for d in range(0, 13):
                di = ovrall.find("<td>")
                ovrall = ovrall[(di+4):]
                #conditions
                if d == 2:
                    di2 = ovrall.find("</td>")
                    offrtg.append(float(ovrall[:di2]))
                elif d == 3:
                    di2 = ovrall.find("</td>")
                    defrtg.append(float(ovrall[:di2]))
                elif d == 10:
                    di2 = ovrall.find("</td>")
                    reb.append(float(ovrall[:di2]))
                elif d == 11:
                    di2 = ovrall.find("</td>")
                    tov.append(float(ovrall[:di2]))
                elif d == 12:
                    di2 = ovrall.find("</td>")
                    efg.append(float(ovrall[:di2]))

        #writing to excel
        os.remove(r"C:\Users\jackm\OneDrive\Desktop\NBA\NBASTATS.xlsx")
        d = {'Name': names, 'OFFRTG': offrtg, 'DEFRTG': defrtg, 'REB': reb,
             'TOV': tov, 'EFG': efg}
        df = pd.DataFrame(data=d)
        df.to_excel(r"C:\Users\jackm\OneDrive\Desktop\NBA\NBASTATS.xlsx", sheet_name="STATS")


NBAScraper()
```
I tried to play around with the closing and quitting functions for the driver, or put the driver in a separate function and run it outside the class, but none of that worked. I realized through some testing that even if it's not inside a loop, selenium will throw the error for a url but run it fine the second time. I tried using implicit waits to solve this but to no avail.
```
Traceback (most recent call last):
  File "C:\Program Files\Spyder\pkgs\spyder_kernels\py3compat.py", line 356, in compat_exec
    exec(code, globals, locals)
  File "c:\users\jackm\spyder\nba.py", line 104, in <module>
    NBAScraper()
  File "c:\users\jackm\spyder\nba.py", line 71, in __init__
    ovrall = str(soup3.find("tbody", "Crom_body__UYOcU").find_all("td"))
AttributeError: 'NoneType' object has no attribute 'find_all'
```
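The AttributeError at the bottom usually means `page_source` was captured before the stats table had rendered, so `soup3.find(...)` returned None. A hedged sketch of guarding that read with an explicit wait (the team id in the URL is only an example):
```
# Hedged sketch: wait for the advanced-stats table before parsing, rather than
# reading page_source immediately after driver.get(). The team id is an example.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.nba.com/stats/team/1610612738/advanced")

# Block (up to 15 s) until the table body the scraper looks for is in the DOM.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "tbody.Crom_body__UYOcU"))
)

soup = BeautifulSoup(driver.page_source, "html.parser")
tbody = soup.find("tbody", "Crom_body__UYOcU")
cells = tbody.find_all("td") if tbody else []   # guard against a missing table
print(len(cells))
driver.quit()
```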

Oanda API - Issue Price - Instruments

I'm using the Oanda API to automate trading strategies. I have a 'price' error that only occurs when selecting some instruments such as XAG (silver); my guess is that there is a classification difference, but Oanda is yet to answer on the matter.
The error does not occur when selecting Forex pairs.
If anyone has had such issues in the past and managed to solve them, I'll be happy to hear from them.
PS: I'm UK based and have access to most products, including CFDs.
```
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import tpqoa


class SMABollTrader(tpqoa.tpqoa):
    def __init__(self, conf_file, instrument, bar_length, SMA, dev, SMA_S, SMA_L, units):
        super().__init__(conf_file)
        self.instrument = instrument
        self.bar_length = pd.to_timedelta(bar_length)
        self.tick_data = pd.DataFrame()
        self.raw_data = None
        self.data = None
        self.last_bar = None
        self.units = units
        self.position = 0
        self.profits = []
        self.price = []

        #*****************add strategy-specific attributes here******************
        self.SMA = SMA
        self.dev = dev
        self.SMA_S = SMA_S
        self.SMA_L = SMA_L
        #************************************************************************

    def get_most_recent(self, days=5):
        while True:
            time.sleep(2)
            now = datetime.utcnow()
            now = now - timedelta(microseconds=now.microsecond)
            past = now - timedelta(days=days)
            df = self.get_history(instrument=self.instrument, start=past, end=now,
                                  granularity="S5", price="M", localize=False).c.dropna().to_frame()
            df.rename(columns={"c": self.instrument}, inplace=True)
            df = df.resample(self.bar_length, label="right").last().dropna().iloc[:-1]
            self.raw_data = df.copy()
            self.last_bar = self.raw_data.index[-1]
            if pd.to_datetime(datetime.utcnow()).tz_localize("UTC") - self.last_bar < self.bar_length:
                break

    def on_success(self, time, bid, ask):
        print(self.ticks, end=" ")
        recent_tick = pd.to_datetime(time)
        df = pd.DataFrame({self.instrument: (ask + bid)/2},
                          index=[recent_tick])
        self.tick_data = self.tick_data.append(df)
        if recent_tick - self.last_bar > self.bar_length:
            self.resample_and_join()
            self.define_strategy()
            self.execute_trades()

    def resample_and_join(self):
        self.raw_data = self.raw_data.append(self.tick_data.resample(self.bar_length,
                                                                     label="right").last().ffill().iloc[:-1])
        self.tick_data = self.tick_data.iloc[-1:]
        self.last_bar = self.raw_data.index[-1]

    def define_strategy(self):  # "strategy-specific"
        df = self.raw_data.copy()

        #******************** define your strategy here ************************
        df["SMA"] = df[self.instrument].rolling(self.SMA).mean()
        df["Lower"] = df["SMA"] - df[self.instrument].rolling(self.SMA).std() * self.dev
        df["Upper"] = df["SMA"] + df[self.instrument].rolling(self.SMA).std() * self.dev
        df["distance"] = df[self.instrument] - df.SMA
        df["SMA_S"] = df[self.instrument].rolling(self.SMA_S).mean()
        df["SMA_L"] = df[self.instrument].rolling(self.SMA_L).mean()
        df["position"] = np.where(df[self.instrument] < df.Lower) and np.where(df["SMA_S"] > df["SMA_L"], 1, np.nan)
        df["position"] = np.where(df[self.instrument] > df.Upper) and np.where(df["SMA_S"] < df["SMA_L"], -1, df["position"])
        df["position"] = np.where(df.distance * df.distance.shift(1) < 0, 0, df["position"])
        df["position"] = df.position.ffill().fillna(0)
        self.data = df.copy()
        #***********************************************************************

    def execute_trades(self):
        if self.data["position"].iloc[-1] == 1:
            if self.position == 0 or None:
                order = self.create_order(self.instrument, self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING LONG")
            elif self.position == -1:
                order = self.create_order(self.instrument, self.units * 2, suppress=True, ret=True)
                self.report_trade(order, "GOING LONG")
            self.position = 1
        elif self.data["position"].iloc[-1] == -1:
            if self.position == 0:
                order = self.create_order(self.instrument, -self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING SHORT")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units * 2, suppress=True, ret=True)
                self.report_trade(order, "GOING SHORT")
            self.position = -1
        elif self.data["position"].iloc[-1] == 0:
            if self.position == -1:
                order = self.create_order(self.instrument, self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING NEUTRAL")
            elif self.position == 1:
                order = self.create_order(self.instrument, -self.units, suppress=True, ret=True)
                self.report_trade(order, "GOING NEUTRAL")
            self.position = 0

    def report_trade(self, order, going):
        time = order["time"]
        units = order["units"]
        price = order["price"]
        pl = float(order["pl"])
        self.profits.append(pl)
        cumpl = sum(self.profits)
        print("\n" + 100 * "-")
        print("{} | {}".format(time, going))
        print("{} | units = {} | price = {} | P&L = {} | Cum P&L = {}".format(time, units, price, pl, cumpl))
        print(100 * "-" + "\n")


trader = SMABollTrader("oanda.cfg", "EUR_GBP", "15m", SMA=82, dev=4, SMA_S=38, SMA_L=135, units=100000)
trader.get_most_recent()
trader.stream_data(trader.instrument, stop=None)
if trader.position != 0:  # if we have a final open position
    close_order = trader.create_order(trader.instrument, units=-trader.position * trader.units,
                                      suppress=True, ret=True)
    trader.report_trade(close_order, "GOING NEUTRAL")
    trader.signal = 0
```
I have done the Hagmann course as well and I recognised your code immediately.
Firstly, the way you define your positions is not the best. Look at the section on combining two strategies; there are two ways to do it.
Now, regarding your price problem: I had a similar situation with BTC. You can download its historical data, but when I plugged it into the strategy code and started to stream I had exactly the same error, indicating that tick data was never streamed.
I am guessing that simply not all instruments are tradeable via the API, or in your case maybe you tried to stream outside trading hours?
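On the position definition, here is a hedged sketch of combining the two conditions element-wise with `&` inside a single `np.where` per signal; chaining two `np.where` calls with `and`, as in `define_strategy` above, effectively ignores the first condition because the non-empty result of `np.where(cond)` is simply truthy. Column names follow the question's code.
```
# Hedged sketch of a combined Bollinger + SMA-crossover position column.
# Assumes df already has the columns built in define_strategy above.
import numpy as np
import pandas as pd

def combined_position(df: pd.DataFrame, instrument: str) -> pd.Series:
    cond_long = (df[instrument] < df["Lower"]) & (df["SMA_S"] > df["SMA_L"])
    cond_short = (df[instrument] > df["Upper"]) & (df["SMA_S"] < df["SMA_L"])

    position = pd.Series(np.where(cond_long, 1.0, np.nan), index=df.index)
    position = pd.Series(np.where(cond_short, -1.0, position), index=df.index)
    # flatten when the price crosses back over the SMA
    position = pd.Series(
        np.where(df["distance"] * df["distance"].shift(1) < 0, 0.0, position),
        index=df.index,
    )
    return position.ffill().fillna(0)
```
Inside `define_strategy` this would replace the two chained `np.where` lines with something like `df["position"] = combined_position(df, self.instrument)`.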

Append and Delete Rows to Grid with GridTableBase

I am having trouble appending and deleting rows. My table changes a lot and must be rebuilt often, so this has been a little tricky. All of my information comes from an SQL database. I am loading the results into a pandas DataFrame and then using it to populate the GridTableBase class. I am now trying to append and delete rows, but am having trouble overriding the class. I have been able to somewhat get it to work, but it behaves strangely. For some reason, self.table.AppendRows(row) doesn't work and throws an error. The original was self.table.AppendRow(row), but AppendRow isn't a method, so I had to use a different approach. I have to change a value in order to get the GridTableMessage to realize there has been a change, which is what I am doing here: data.iloc[data.shape[0]-1, 0] = str(val).
Ideally, I would add/delete the row from the table itself, but I can't figure out how to do that. I have derived most of my code from https://github.com/wxWidgets/Phoenix/blob/master/demo/Grid_MegaExample.py, but a lot of that will not work properly for me.
As of now, I can append a row, but for some reason it appends two even though only one has been added to the DataFrame and GetNumberRows is returning the correct count. I assume it has something to do with the way I am accessing the table class. Can anyone provide some clarity?
```
import wx
import wx.grid as gridlib
import pandas as pd


def rowPopup(self, row, evt):
    """(row, evt) -> display a popup menu when a row label is right clicked"""
    appendID = wx.Window.NewControlId()  # wx.NewId()
    deleteID = wx.Window.NewControlId()  # wx.NewId()
    x = self.GetRowSize(row)/2
    if not self.GetSelectedRows():
        self.SelectRow(row)
    menu = wx.Menu()
    xo, yo = evt.GetPosition()
    menu.Append(appendID, "Append Row")
    menu.Append(deleteID, "Delete Row(s)")

    def append(event, self=self, row=row):
        global data
        #print("Append")
        #self.table.AppendRows(row)
        dlg = wx.TextEntryDialog(self, 'Enter a new Key ID to insert into the ' + str("'") + data.columns[0] + str("'") + ' column.', 'Insert New Record')
        dlg.SetValue("")
        if dlg.ShowModal() == wx.ID_OK:
            #print('You entered: %s\n' % dlg.GetValue())
            val = dlg.GetValue()
            #data[~pd.isnull(data).all(1)].fillna('')
            #data['tables_id'].apply('(g)'.format)
            data.loc[data.iloc[-1].name + 1, :] = ""
            data.iloc[data.shape[0]-1, 0] = str(val)
            self.Reset()
            #print(data)
            #data = data.append(pd.Series(dtype='object'), ignore_index=True)
            #self.data = DataTable(data)
            #data[~pd.isnull(data).all(1)].fillna('')
            #self.data = DataTable(data)

    def delete(event, self=self, row=row):
        global data
        rows = self.GetSelectedRows()
        data.drop(data.index[rows], inplace=True)
        print(data)
        self.Reset()
        #self.table.DeleteRow(row)
        #print(row)
        #print(rows)

    #EVT_MENU(self, appendID, append)
    #EVT_MENU(self, deleteID, delete)
    self.Bind(wx.EVT_MENU, append, id=appendID)
    self.Bind(wx.EVT_MENU, delete, id=deleteID)
    self.PopupMenu(menu, wx.Point(round(x), round(yo)))
    menu.Destroy()
class DataTable(gridlib.GridTableBase):
    def __init__(self, data):
        gridlib.GridTableBase.__init__(self)
        self.headerRows = 1
        if data is None:
            data = pd.DataFrame()
        self.data = data
        print("Instance")
        #Store the row and col length to see if table has changed in size
        self._rows = self.GetNumberRows()
        self._cols = self.GetNumberCols()
        self.odd = gridlib.GridCellAttr()
        self.odd.SetBackgroundColour((217, 217, 217))
        self.even = gridlib.GridCellAttr()
        self.even.SetBackgroundColour((255, 255, 255))

    def GetAttr(self, row, col, kind):
        attr = [self.even, self.odd][row % 2]
        attr.IncRef()
        return attr

    def GetNumberRows(self):
        #print("# Rows:", len(self.data))
        return len(self.data)  # - 1

    def GetTypeName(self, row, col):
        #print(wx.grid.GRID_VALUE_STRING)
        return wx.grid.GRID_VALUE_STRING

    def GetNumberCols(self):
        #print("# Cols:", len(self.data.columns) + 1)
        return len(self.data.columns) + 1
        #return len(self.data.columns) #+ 1

    def IsEmptyCell(self, row, col):
        return False

    def GetValue(self, row, col):
        if col == 0:
            try:
                return self.data.index[row]
            except:
                print("Row,Col(", row, col, ")", "OOB")
                return ""
        else:
            try:
                return str(self.data.iloc[row, col - 1])
            except:
                print("Row,Col(", row, col, ")", "OOB")
                return ""

    def GetColLabelValue(self, col):
        if col == 0:
            if self.data.index.name is None:
                return 'Index'
            else:
                return self.data.index.name
        return self.data.columns[col - 1]

    def ResetView(self, grid):
        """
        (wxGrid) -> Reset the grid view. Call this to
        update the grid if rows and columns have been added or deleted
        """
        print('Old::', self._rows, self._cols)
        print('New::', self.GetNumberRows(), self.GetNumberCols())
        print(data)
        grid.BeginBatch()
        for current, new, delmsg, addmsg in [
            (self._rows, self.GetNumberRows(), gridlib.GRIDTABLE_NOTIFY_ROWS_DELETED, gridlib.GRIDTABLE_NOTIFY_ROWS_APPENDED),
            (self._cols, self.GetNumberCols(), gridlib.GRIDTABLE_NOTIFY_COLS_DELETED, gridlib.GRIDTABLE_NOTIFY_COLS_APPENDED),
        ]:
            if new < current:
                msg = gridlib.GridTableMessage(self, delmsg, new, current-new)
                #grid.ProcessTableMessage(msg)
                self.GetView().ProcessTableMessage(msg)
                print("OvN:", self._rows, self.GetNumberRows())
                return True
            if new > current:
                msg = gridlib.GridTableMessage(self, addmsg, new-current)
                self.GetView().ProcessTableMessage(msg)
                grid.ProcessTableMessage(msg)
                #self.UpdateValues(grid)
        msg = gridlib.GridTableMessage(self, gridlib.GRIDTABLE_REQUEST_VIEW_GET_VALUES)
        grid.ProcessTableMessage(msg)
        print("OvN:", self._rows, self.GetNumberRows())
        grid.EndBatch()
        self._rows = self.GetNumberRows()
        self._cols = self.GetNumberCols()
        # update the column rendering plugins
        #self._updateColAttrs(grid)
        # XXX
        # Okay, this is really stupid, we need to "jiggle" the size
        # to get the scrollbars to recalibrate when the underlying
        # grid changes.
        h, w = grid.GetSize()
        grid.SetSize((h+1, w))
        grid.SetSize((h, w))
        grid.ForceRefresh()

    def UpdateValues(self, grid):
        """Update all displayed values"""
        # This sends an event to the grid table to update all of the values
        msg = gridlib.GridTableMessage(self, gridlib.GRIDTABLE_REQUEST_VIEW_GET_VALUES)
        grid.table.ProcessTableMessage(msg)
class DataGrid(gridlib.Grid):
    def __init__(self, parent, data, lc, tc):
        gridlib.Grid.__init__(self, parent, -1)  # ,colnames,-1 # data
        self.lc = lc
        self.tc = tc
        self.table = DataTable(data)
        self.SetTable(self.table, True)
        self.Bind(gridlib.EVT_GRID_LABEL_RIGHT_CLICK, self.OnLabelRightClicked)
        self.Bind(gridlib.EVT_GRID_CELL_RIGHT_CLICK, self.OnCellRightClick)
        self.Bind(gridlib.EVT_GRID_CELL_CHANGED, self.onCellChanged)  # wx.grid

    def Reset(self):
        """reset the view based on the data in the table. Call
        this when rows are added or destroyed"""
        self.table.ResetView(self)

    def OnCellRightClick(self, event):
        print("OnCellRightClick: (%d,%d)\n" % (event.GetRow(), event.GetCol()))

    def OnLabelRightClicked(self, evt):
        row, col = evt.GetRow(), evt.GetCol()
        if row == -1: print("col")  # self.colPopup(col, evt)
        elif col == -1: self.rowPopup(row, evt)

    def rowPopup(self, row, evt):
        """(row, evt) -> display a popup menu when a row label is right clicked"""
        appendID = wx.Window.NewControlId()  # wx.NewId()
        deleteID = wx.Window.NewControlId()  # wx.NewId()
        x = self.GetRowSize(row)/2
        if not self.GetSelectedRows():
            self.SelectRow(row)
        menu = wx.Menu()
        xo, yo = evt.GetPosition()
        menu.Append(appendID, "Append Row")
        menu.Append(deleteID, "Delete Row(s)")

        def append(event, self=self, row=row):
            global data
            #print("Append")
            #self.table.AppendRows(row)
            dlg = wx.TextEntryDialog(self, 'Enter a new Key ID to insert into the ' + str("'") + data.columns[0] + str("'") + ' column.', 'Insert New Record')
            dlg.SetValue("")
            if dlg.ShowModal() == wx.ID_OK:
                val = dlg.GetValue()
                #data[~pd.isnull(data).all(1)].fillna('')
                #data['tables_id'].apply('(g)'.format)
                data.loc[data.iloc[-1].name + 1, :] = ""
                data.iloc[data.shape[0]-1, 0] = str(val)
                self.Reset()
                #print(data)
                #self.data = DataTable(data)

        def delete(event, self=self, row=row):
            global data
            rows = self.GetSelectedRows()
            data.drop(data.index[rows], inplace=True)
            print(data)
            self.Reset()

        self.Bind(wx.EVT_MENU, append, id=appendID)
        self.Bind(wx.EVT_MENU, delete, id=deleteID)
        self.PopupMenu(menu, wx.Point(round(x), round(yo)))
        menu.Destroy()
class MainFrame(wx.Frame):
    def __init__(self, parent, data):
        wx.Frame.__init__(self, parent, -1, "Varkey Foundation")  # , size=(640,480))
        #Create a panel
        self.p = wx.Panel(self)
        self.Maximize(True)
        #Create blank dataframe
        data = pd.DataFrame()  # pd.DataFrame(np.random.randint(0,100,size=(200, 5)),columns=list('EFGHD')
        #data.reset_index(drop=True, inplace=True)
        self.data = DataTable(data)
        self.nb = wx.Notebook(self.p)
        self.p.SetBackgroundColour(wx.Colour(0, 0, 0))  # 38,38,38
        self.nb.SetBackgroundColour(wx.Colour(58, 56, 56))
        #self.SetBackgroundColour( wx.Colour( 255, 255, 56 ) )
        #create the page windows as children of the notebook
        self.page1 = PageOne(self.nb)
        self.page2 = PageTwo(self.nb)
        self.page3 = PageThree(self.nb)
        # add the pages to the notebook with the label to show on the tab
        self.nb.AddPage(self.page1, "Data")
        self.nb.AddPage(self.page2, "Analyze")
        self.nb.AddPage(self.page3, "Change Log")
        #CreateFonts
        self.b_font = wx.Font(14, wx.ROMAN, wx.NORMAL, wx.BOLD, True)
        self.lbl_font = wx.Font(14, wx.ROMAN, wx.NORMAL, wx.NORMAL, True)
        self.cb_font = wx.Font(11, wx.SCRIPT, wx.ITALIC, wx.NORMAL, True)
        self.h_font = wx.Font(18, wx.DECORATIVE, wx.ITALIC, wx.BOLD, True)
        #Create username textcontrol <<<<<<<<<<<< Passed to grid class
        self.tc_user = wx.TextCtrl(self.p, value='cmccall95', size=(130, 25))
        self.tc_password = wx.TextCtrl(self.p, value='Achilles95', style=wx.TE_PASSWORD | wx.TE_PROCESS_ENTER, size=(130, 25))
        self.tc_password.Bind(wx.EVT_TEXT_ENTER, self.onLogin)
        self.tc_user.SetFont(self.cb_font)
        self.tc_password.SetFont(self.cb_font)
        #Create Change log lstCtrl <<<<<<<<<<<< Passed to grid class
        self.lc_change = wx.ListCtrl(self.p, -1, style=wx.TE_MULTILINE | wx.LC_REPORT | wx.LC_VRULES)
        self.lc_change.InsertColumn(0, "User ID")
        self.lc_change.InsertColumn(1, "Status")
        self.lc_change.InsertColumn(2, "Description")
        self.lc_change.InsertColumn(3, "Date/Time")
        #Set column widths
        self.lc_change.SetColumnWidth(0, 75)
        self.lc_change.SetColumnWidth(1, 75)
        self.lc_change.SetColumnWidth(2, 450)
        self.lc_change.SetColumnWidth(3, 125)
        #Create the grid and continue layout
        self.grid = DataGrid(self.page1, data, self.lc_change, self.tc_user)
        #More layout code...

    def onLoadNewData(self, event):  # This is how I'm replacing the data in my table class
        global data
        self.Freeze()
        if self.combo_table.GetValue():
            #Connect to db
            self.connect_mysql()
            #Determine db table
            self.getTable()
            #Get new data
            sql_query = "SELECT * FROM " + tbl
            self.cursor.execute(sql_query)
            temp = pd.read_sql(sql_query, con=self.db_con)
            temp.reset_index(drop=True, inplace=True)
            data = temp[~pd.isnull(temp).all(1)].fillna('')
            #Create title #if data:
            if not data.empty:
                self.title.SetLabel(str(self.combo_table.GetValue()))
                print(str(self.combo_table.GetValue()))
                self.grid.Destroy()
                self.grid = DataGrid(self.page1, data, self.lc_change, self.tc_user)
                #self.grid.HideCol(0)
                self.grid.AutoSizeColumns()
                #Insert grid into existing sizer
                self.p1_sizer.Insert(1, self.grid, 1, wx.RIGHT | wx.LEFT | wx.EXPAND, 20)
                self.p1_sizer.Layout()
                #RESIZE
            else:
                print("Error:Dataframe is empty")
            self.close_connection()
        else:
            print('CANT BE BLANK')
        self.Thaw()


if __name__ == '__main__':
    import sys
    app = wx.App()
    frame = MainFrame(None, sys.stdout)
    frame.Show(True)
    app.MainLoop()
```
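For the double-append symptom, one thing worth checking (a hedged observation, not a confirmed diagnosis) is that the ROWS_APPENDED message is processed only once per change; in ResetView above, the add branch sends it both through self.GetView() and through grid, which refer to the same grid once SetTable has been called. A minimal sketch of appending a row to the backing DataFrame and notifying the grid once (assumes the DataFrame has a default RangeIndex):
```
# Hedged sketch (wxPython Phoenix): grow the DataFrame, then tell the grid about
# exactly one new row, exactly once.
import wx.grid as gridlib

def append_row(table, values):
    """table: a GridTableBase subclass whose .data is a pandas DataFrame."""
    table.data.loc[len(table.data)] = values   # assumes a default RangeIndex
    msg = gridlib.GridTableMessage(
        table, gridlib.GRIDTABLE_NOTIFY_ROWS_APPENDED, 1   # one appended row
    )
    table.GetView().ProcessTableMessage(msg)   # notify the attached grid once
    table.GetView().ForceRefresh()
```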

Pagination in Scrapy on a JavaScript-driven page via Selenium webdriver

I am attempting to paginate through the data table on this page, located below the search form.
My code successfully scrapes the first page and I successfully click the next button (using Selenium) to get the next page of results.
However, creating a Response instance and passing it to self.parse() does not work:
```
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
```
Also, even though (if you analyze the call stack) I am returning None from self.parse, I get this warning when running this Scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
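For context, the warning is about parse being a generator (it contains yield) while also returning a value: the returned value is attached to StopIteration and never reaches the caller, so the recursed items are silently dropped. A minimal plain-Python illustration, not Scrapy-specific:
```
# Any function containing `yield` is a generator, so `return self.parse(r)` does
# not hand its items to the caller -- it just ends iteration. `yield from` does.
def parse_with_return(pages):
    for item in pages[0]:
        yield item
    if len(pages) > 1:
        return parse_with_return(pages[1:])   # items from later pages are lost

def parse_with_yield_from(pages):
    for item in pages[0]:
        yield item
    if len(pages) > 1:
        yield from parse_with_yield_from(pages[1:])   # items are forwarded

pages = [["a", "b"], ["c", "d"]]
print(list(parse_with_return(pages)))       # ['a', 'b']
print(list(parse_with_yield_from(pages)))   # ['a', 'b', 'c', 'd']
```
In a Scrapy callback the equivalent of the second form would be `yield from self.parse(r)`, or the while-loop re-binding used in the working code further down.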
Here is my current source code:
```
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver

URL_20 = "https://www.signalstart.com/search-signals"
URL_1000 = "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"


class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()


def raw_page_url(i=1):
    """
    Return raw page of 100 results. There are 8 such pages
    :param i: which page number
    :return:
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)


class SignalStartSpider(scrapy.Spider):
    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently. But turns out they dont.

        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        print(" >>>>>> URL of the response object is {}".format(response.url))
        if len(response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = Provider()
            Behold().show('provider')
            details_url = None
            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1:  # name
                    details_url = datum.css("a::attr(href)").get()
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            if details_url:
                yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
        print("------------------------------- next page logic --------------------------------------")
        next = self.driver.find_element_by_css_selector('.fa-angle-right')
        if next is not None:
            print(" **** NEXT IS -NOT- NONE")
            next.click()
            page_source = self.driver.page_source
            r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
            print(" >>>> calling self.parse again")
            return self.parse(r)
        else:
            print(" **** NEXT IS NONE")
            return None
        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
```
Instead of recursively calling self.parse, it is better to use a while loop and simply re-bind the Response instance with the page_source from the Selenium webdriver. Working code:
```
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver

URL_20 = "https://www.signalstart.com/search-signals"
URL_1000 = "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"


class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()


def raw_page_url(i=1):
    """
    Return raw page of 100 results. There are 8 such pages
    :param i: which page number
    :return:
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)


class SignalStartSpider(scrapy.Spider):
    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently. But turns out they dont.

        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        print(" >>>>>> URL of the response object is {}".format(response.url))
        if len(response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        while True:
            for provider in response.xpath("//div[@class='row']//tr"):
                data_row = Provider()
                Behold().show('provider')
                details_url = None
                for i, datum in enumerate(provider.xpath('td')):
                    Behold().show('i', 'datum')
                    if i == 1:  # name
                        details_url = datum.css("a::attr(href)").get()
                    if i in skip:
                        print(".....skipping")
                        continue
                    text = html_text.extract_text(datum.get())
                    column_name = td[i]
                    if column_name in postprocess:
                        text = postprocess[column_name](text)
                    data_row[column_name] = text
                if details_url:
                    yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
            print("------------------------------- next page logic --------------------------------------")
            next = self.driver.find_element_by_css_selector('.fa-angle-right')
            if next is not None:
                print(" **** NEXT IS -NOT- NONE")
                next.click()
                page_source = self.driver.page_source
                r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
                print(" >>>> looping self.parse again")
                response = r
            else:
                print(" **** NEXT IS NONE")
                break
        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
```

Getting "'type' object is not iterable" error in my code

I'm getting "Type object is not iterable" error while running this code of my project in EucledianScore's for loop ->j in train_user..This code is for user-user collaborative filtering using the dataset from MovieLens.I've seen many "Type object is not iterable" problem's solution but those are not working for my code.
```
import math
import pandas as pd


def user_collabo():
    def EucledianScore(train_user, test_user):
        sum = 0
        count = 0
        for i in test_user:
            score = 0
            for j in train_user:
                if(int(i[1]) == int(j[1])):
                    score = ((float(i[2])-float(j[2]))*(float(i[2])-float(j[2])))
                    count = count + 1
            sum = sum + score
        if(count < 4):
            sum = 1000000
        return(math.sqrt(sum))

    data_cols = ['user id', 'movie id', 'rating', 'timestamp']
    item_cols = ['movie id', 'movie title', 'release date',
                 'video release date', 'IMDb URL', 'unknown', 'Action',
                 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                 'Musical', 'Mystery', 'Romance ', 'Sci-Fi', 'Thriller',
                 'War', 'Western']
    user_cols = ['user id', 'age', 'gender', 'occupation',
                 'zip code']
    users = pd.read_csv('u.user', sep='|', names=user_cols, encoding='latin-1')
    item = pd.read_csv('u.item', sep='|', names=item_cols, encoding='latin-1')
    data = pd.read_csv('u.data', sep='\t', names=data_cols, encoding='latin-1')

    utrain = (data.sort_values('user id'))[:99832]
    print(utrain.tail())
    utest = (data.sort_values('user id'))[99833:]
    print(utest.head())
    utrain = utrain.as_matrix(columns=['user id', 'movie id', 'rating'])
    utest = utest.as_matrix(columns=['user id', 'movie id', 'rating'])

    users_list = []
    for i in range(1, 943):
        list1 = []
        for j in range(0, len(utrain)):
            if utrain[j][0] == i:
                list1.append(utrain[j])
            else:
                break
        utrain = utrain[j:]
        users_list.append(list1)

    score_list = []
    for i in range(0, 942):
        score_list.append([i+1, EucledianScore(users_list[i], utest)])

    score = pd.DataFrame(score_list, columns=['user id', 'Eucledian Score'])
    score = score.sort_values(by='Eucledian Score')
    #print(score)
    score_matrix = score.as_matrix()
    user = int(score_matrix[0][0])

    common_list = []
    full_list = []
    for i in utest:
        for j in users_list[user-1]:
            if(int(i[1]) == int(j[1])):
                common_list.append(int(j[1]))
            full_list.append(j[1])

    common_list = set(common_list)
    full_list = set(full_list)
    recommendation = full_list.difference(common_list)

    item_list = (((pd.merge(item, data).sort_values(by='movie id')).groupby('movie title')))['movie id', 'movie title', 'rating']
    item_list = item_list.mean()
    item_list['movie title'] = item_list.index
    item_list = item_list.as_matrix()

    recommendation_list = []
    for i in recommendation:
        recommendation_list.append(item_list[i-1])

    recommendation = (pd.DataFrame(recommendation_list, columns=['movie id', 'mean rating', 'movie title'])).sort_values(by='mean rating', ascending=False)
    print(recommendation[['mean rating', 'movie title']])


user_collabo()
```
Output:
```
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-20-24f9c9703462> in <module>
     71 recommendation = (pd.DataFrame(recommendation_list,columns = ['movie id','mean rating' ,'movie title'])).sort_values(by = 'mean rating', ascending = False)
     72 print(recommendation[['mean rating','movie title']])
---> 73 user_collabo()

<ipython-input-20-24f9c9703462> in user_collabo()
     43 score_list = []
     44 for i in range(0,942):
---> 45     score_list.append([i+1,EucledianScore(users_list[i], utest)])
     46
     47 score = pd.DataFrame(score_list, columns = ['user id','Eucledian Score'])

<ipython-input-20-24f9c9703462> in EucledianScore(train_users, test_user)
      5 for i in test_user:
      6     score = 0
----> 7     for j in train_users:
      8         if(int(i[1]) == int(j[1])):
      9             score= ((float(i[2])-float(j[2]))*(float(i[2])-float(j[2])))

TypeError: 'type' object is not iterable
```
The error comes from this line:
users_list.append(list)
This appends the Python builtin type list. I think you meant users_list.append(list1), which will build up users_list into a list of lists.
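A minimal reproduction of that error, independent of the MovieLens code:
```
users_list = []
users_list.append(list)      # appends the builtin type `list`, not a list object

for row in users_list:
    for j in row:            # TypeError: 'type' object is not iterable
        pass
```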