Scrapy CrawlSpider - Yielding multiple items from a single spider - scrapy

I'm very new to both Scrapy and Python, so my vocabulary might be inaccurate.
I'm trying to get two different items with my CrawlSpider, but I cannot work out how to do it. Currently only the first item, CourseItem, is crawled; the other one is never produced.
I believe the problem is probably in the way I'm wiring everything together (callback=parse_course?), but I can't figure out exactly where.
Each function works properly if I make one spider per item.
I've tried putting everything in the same function, but that doesn't help either.
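For reference, this is the kind of behaviour I'm after: a single callback that yields more than one item type in one pass. A minimal sketch only (the item classes, selectors and URL here are placeholders, not my real ones):
```
import scrapy

class FooItem(scrapy.Item):
    title = scrapy.Field()

class BarItem(scrapy.Item):
    url = scrapy.Field()

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # A single callback can yield items of different classes;
        # Scrapy passes every yielded item through the item pipelines.
        yield FooItem(title=response.xpath('//title/text()').extract_first())
        yield BarItem(url=response.url)
```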
spider.py
# -*- coding: utf-8 -*-
import scrapy
import re
import datetime
from scrapy.http import Request
from scrapy_spider.items import CourseItem, PronosticItem
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor


class GenycourseSpider(scrapy.spiders.CrawlSpider):
    name = 'genycourse'
    allowed_domains = ['www.geny.com']

    rules = (
        Rule(
            LinkExtractor(allow='/partants-pmu/*'), callback="parse_course"
        ),
    )

    def start_requests(self):
        date = datetime.date(2016, 03, 12)
        datefin = datetime.date(2016, 03, 12)
        while date <= datefin:
            url = 'http://www.geny.com/reunions-courses-pmu?date=%s' % date.isoformat()
            date = date + datetime.timedelta(days=1)
            yield Request(url)

    def parse_course(self, response):
        # Retrieve the raw values
        date = response.request.url
        reunion = response.xpath("//*[@id='yui-main']/div/div[2]/div[1]/text()").extract_first().strip()
        course = response.xpath("//*[@id='yui-main']/div/div[2]/div[3]/span/strong/text()[1]").extract_first().strip()
        type = response.xpath("//*[@id='yui-main']/div/div[2]/span").extract_first().strip()
        hippodrome = response.xpath("//*[@id='navigation']/a[3]/text()").extract_first()
        # Clean up the result
        # Isolate the date
        date = date[33:43]
        # Split at '(' and keep the meeting (reunion) number
        temp = re.split(r"\(", reunion)
        reunion = temp[1]
        reunion = re.sub(r'[R\)]', '', reunion)
        # Identify the race types and store them
        type = re.findall(r'(Attel|Plat|Haies|Steeple-chase|Cross|Mont)', type)
        plat = "Plat" in type
        obstacle = "Haies" or "Steeple-chase" in type
        # Build the item
        courseinfo = CourseItem()
        courseinfo['date'] = date
        courseinfo['reunion'] = reunion
        courseinfo['course'] = course
        courseinfo['type'] = type
        courseinfo['hippodrome'] = hippodrome
        loopcheval = 1
        numerocheval = 1
        while numerocheval != None:
            numerocheval = response.xpath("//*[@id='tableau_partants']/tbody/tr[" + str(loopcheval) + "]/td[1]/text()").extract_first()
            if plat is True:
                cotecheval = response.xpath("//*[@id='tableau_partants']/tbody/tr[" + str(loopcheval) + "]/td[12]/text()").extract_first().strip()
            elif obstacle is True:
                cotecheval = response.xpath("//*[@id='tableau_partants']/tbody/tr[" + str(loopcheval) + "]/td[11]/text()").extract_first().strip()
            else:
                cotecheval = response.xpath("//*[@id='tableau_partants']/tbody/tr[" + str(loopcheval) + "]/td[10]/text()").extract_first().strip()
            if numerocheval is not None:
                courseinfo['numero'] = numerocheval
                courseinfo['cote'] = cotecheval
                yield courseinfo
            loopcheval = loopcheval + 1

    def parse_prono(self, response):
        date = response.request.url
        reunion = response.xpath("//*[@id='yui-main']/div/div[2]/div[1]/text()").extract_first().strip()
        course = response.xpath("//*[@id='yui-main']/div/div[2]/div[3]/span/strong/text()[1]").extract_first().strip()
        date = date[33:43]
        temp = re.split(r"\(", reunion)
        reunion = temp[1]
        reunion = re.sub(r'[R\)]', '', reunion)
        pronoinfo = PronosticItem()
        pronoinfo['date'] = date
        pronoinfo['reunion'] = reunion
        pronoinfo['course'] = course
        ligne = 1
        testpronostique = pronostiqueur = response.xpath("//*[@id='selectionsPresse']/table/tr[1]/td[1]/div/div[1]").extract_first()
        if testpronostique is None:
            pronoinfo['pronostiqueur'] = 'Pas de pronostic'
            pronoinfo['premier'] = 0
            pronoinfo['second'] = 0
            yield pronoinfo
        else:
            while ligne <= 3:
                while ligne <= 2:
                    colonne = 1
                    while colonne <= 3:
                        pronostiqueur = response.xpath("//*[@id='selectionsPresse']/table/tr[" + str(ligne) + "]/td[" + str(colonne) + "]/div/div[1]").extract_first()
                        pronostiqueur = re.findall(r'(Radio|AIP|Sud|Presse|Casaque|Courrier|che)', pronostiqueur)
                        pronostique = response.xpath("//*[@id='selectionsPresse']/table/tr[" + str(ligne) + "]/td[" + str(colonne) + "]/div/div[2]/text()").extract_first().strip()
                        pronostique = re.split(r" - ", pronostique)
                        premier = pronostique[0]
                        second = pronostique[1]
                        pronoinfo['pronostiqueur'] = pronostiqueur
                        pronoinfo['premier'] = premier
                        pronoinfo['second'] = second
                        yield pronoinfo
                        colonne = colonne + 1
                        if colonne == 4:
                            ligne = ligne + 1
                if ligne == 3:
                    pronostiqueur = response.xpath("//*[@id='selectionsPresse']/table/tr[3]/td[1]/div/div[1]").extract_first()
                    pronostiqueur = re.findall(r'(Radio|AIP|Sud|Presse|Casaque|Courrier|che)', pronostiqueur)
                    pronostique = response.xpath("//*[@id='selectionsPresse']/table/tr[3]/td[1]/div/div[2]/text()").extract_first().strip()
                    pronostique = re.split(r" - ", pronostique)
                    premier = pronostique[0]
                    second = pronostique[1]
                    pronoinfo['pronostiqueur'] = pronostiqueur
                    pronoinfo['premier'] = premier
                    pronoinfo['second'] = second
                    yield pronoinfo
                    ligne = ligne + 1
items.py
import scrapy

class CourseItem(scrapy.Item):
    date = scrapy.Field()
    reunion = scrapy.Field()
    course = scrapy.Field()
    type = scrapy.Field()
    hippodrome = scrapy.Field()
    numero = scrapy.Field()
    cote = scrapy.Field()

class PronosticItem(scrapy.Item):
    date = scrapy.Field()
    reunion = scrapy.Field()
    course = scrapy.Field()
    pronostiqueur = scrapy.Field()
    premier = scrapy.Field()
    second = scrapy.Field()
I'm expecting both CourseItem() and PronosticItem() to be populated and sent to the pipelines, not just CourseItem().
Thanks for your help!
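For reference, my understanding is that on the pipeline side the two item classes can be told apart with isinstance. A minimal sketch (the pipeline class name is made up):
```
from scrapy_spider.items import CourseItem, PronosticItem

class RoutingPipeline(object):
    """Hypothetical pipeline: handle each item class differently."""
    def process_item(self, item, spider):
        if isinstance(item, CourseItem):
            pass  # e.g. write to the courses storage
        elif isinstance(item, PronosticItem):
            pass  # e.g. write to the pronostics storage
        return item
```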

Related

Selenium Issue with Exec?

I am running a web scraper with Selenium to get some data on the NBA. I have URLs for the pages of each of the 30 teams, but when I run the code it only gets through a few of the URLs and then crashes with the error shown below:
# web scraper
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import pandas as pd
import os

class NBAScraper:
    def __init__(self):
        # part 1
        url = "https://www.nba.com/teams"
        HTML = requests.get(url)
        soup = BeautifulSoup(HTML.text, 'html.parser')
        text = str(soup.find_all("a", "Anchor_anchor__cSc3P TeamFigureLink_teamFigureLink__uqnNO"))
        ids = []
        for i in range(0, 30):
            hr = text.find("stats")
            ids.append(text[(hr+11):(hr+21)])
            text = text[(hr+22):]
        # part 2
        names = []
        for j in range(0, 30):
            url2 = "https://www.nba.com/stats/team/" + str(ids[j]) + "/advanced"
            HTML2 = requests.get(url2)
            soup2 = BeautifulSoup(HTML2.text, 'html.parser')
            ## div class="TeamHeader_name__MmHlP
            name = str(soup2.find("div", "TeamHeader_name__MmHlP"))
            ni = name.find("div>")
            ni2 = name.find("<!")
            name1 = name[(ni+4):ni2]
            name = name[ni2:]
            ni3 = name.find("<div>")
            name = name[(ni3+5):]
            ni4 = name.find("</div>")
            name2 = name[:ni4]
            n = name1 + " " + name2
            names.append(n)
        ## tbody class="Crom_body__UYOcU"
        # part 3
        offrtg = []
        defrtg = []
        reb = []
        tov = []
        efg = []
        for k in range(0, 30):
            self.driver = webdriver.Chrome()
            url3 = "https://www.nba.com/stats/team/" + str(ids[k]) + "/advanced"
            self.driver.get(url3)
            rndrhtml = self.driver.page_source
            self.driver.close()
            # self.driver.quit()
            soup3 = BeautifulSoup(rndrhtml, 'html.parser')
            ovrall = str(soup3.find("tbody", "Crom_body__UYOcU").find_all("td"))
            for d in range(0, 13):
                di = ovrall.find("<td>")
                ovrall = ovrall[(di+4):]
                # conditions
                if d == 2:
                    di2 = ovrall.find("</td>")
                    offrtg.append(float(ovrall[:di2]))
                elif d == 3:
                    di2 = ovrall.find("</td>")
                    defrtg.append(float(ovrall[:di2]))
                elif d == 10:
                    di2 = ovrall.find("</td>")
                    reb.append(float(ovrall[:di2]))
                elif d == 11:
                    di2 = ovrall.find("</td>")
                    tov.append(float(ovrall[:di2]))
                elif d == 12:
                    di2 = ovrall.find("</td>")
                    efg.append(float(ovrall[:di2]))
        # writing to excel
        os.remove(r"C:\Users\jackm\OneDrive\Desktop\NBA\NBASTATS.xlsx")
        d = {'Name': names, 'OFFRTG': offrtg, 'DEFRTG': defrtg, 'REB': reb,
             'TOV': tov, 'EFG': efg}
        df = pd.DataFrame(data=d)
        df.to_excel(r"C:\Users\jackm\OneDrive\Desktop\NBA\NBASTATS.xlsx", sheet_name="STATS")

NBAScraper()
I tried playing around with the driver's close and quit calls, and with putting the driver in a separate function and running it outside the class, but none of that worked. I realized through some testing that even outside a loop, Selenium will throw the error for a URL on the first attempt but run it fine the second time. I tried using implicit waits to solve this, but to no avail.
Traceback (most recent call last):
File "C:\Program Files\Spyder\pkgs\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\users\jackm\spyder\nba.py", line 104, in <module>
NBAScraper()
File "c:\users\jackm\spyder\nba.py", line 71, in __init__
ovrall = str(soup3.find("tbody", "Crom_body__UYOcU").find_all("td"))
AttributeError: 'NoneType' object has no attribute 'find_all'
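One variant I have been considering (not in the code above) is waiting explicitly for the stats table to render before reading page_source, roughly like this sketch (the 20-second timeout and the team id are arbitrary examples):
```
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.nba.com/stats/team/1610612738/advanced")  # example team id
# Block until the advanced-stats table is actually present in the DOM.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "tbody.Crom_body__UYOcU"))
)
soup = BeautifulSoup(driver.page_source, "html.parser")
tbody = soup.find("tbody", "Crom_body__UYOcU")  # should not be None once the wait succeeds
driver.quit()
```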

WebDriverException when trying to scrape Amazon for product title and price using Selenium

I'm attempting to scrape Amazon for iPhone 11 names and prices, but when I run the code I get a WebDriverException (the error was posted only as a screenshot and is not reproduced here).
My code is the following:
```
# First project
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager  # imports implied by the snippet

class CrawledInfo:
    def __init__(self, product_name, product_price, cust_name=None, cust_location=None, rating=None, review=None, review_date=None) -> None:
        self.cust_name = cust_name
        self.cust_location = cust_location
        self.product_name = product_name
        self.product_price = product_price
        self.rating = rating
        self.review = review
        self.review_date = review_date

class CrawlerBot:
    def item(self, name):
        count = 1
        page = 1
        pageIncrement = 1
        maxRetrieves = 100
        url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
        l = []

        # Declaring options
        options = Options()
        options.headless = False
        options.add_experimental_option('detach', True)

        browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        browser.maximize_window()
        browser.get(url)
        browser.set_page_load_timeout(10)

        while True:
            try:
                if pageIncrement * page > maxRetrieves:
                    break
                if count > pageIncrement:
                    page += 1
                    count = 1

                # Capture item name
                xPathTitle = '//*[@id="search"]/div[1]/div[2]/div/span[3]/div[2]/div[' + str(count) + ']/div/span/div/div/div[2]/div[2]/div/div[1]/div/div/div[1]/h2/a/span'
                title = browser.find_element_by_xpath(xPathTitle)
                titleText = title.get_attribute('innerHTML').splitLines()[0]
                title.click()

                # Capture item price
                xPathPrice = '//*[@id="price_inside_buybox"]'
                price = browser.find_element_by_xpath(xPathPrice)
                priceText = price.get_attribute('innerHTML').splitLines()

                # Return to the search page
                url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
                browser.get(url)
                browser.set_page_load_timeout(10)

                # Send the results to class CrawledInfo
                info = CrawledInfo(titleText, priceText)
                l.append(info)
                count += 1
            except Exception as e:
                print('Exception: ', e)
                count += 1
                if pageIncrement * page > maxRetrieves:
                    break
                if count > pageIncrement:
                    page += 1
                    count = 1

                # Return to the search page
                url = 'https://www.amazon.co.uk/s?k=' + name + '&page=' + str(page)
                browser.get(url)
                browser.set_page_load_timeout(10)
        browser.close()
        return l

# Creating the object
start_crawler = CrawlerBot()

with open('results', 'w', newline='', encoding='utf-8') as fileWriter:
    dataWriter = csv.writer(fileWriter, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for dat in start_crawler.item('iphone 11'):
        dataWriter.writerow([dat.product_name, dat.product_price])
```
Does anyone have an idea of what's wrong?
When my code is working right, I expect it to create a CSV file with the names of the iPhone 11 listings together with their prices.
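One detail worth noting while debugging, separate from the WebDriverException itself: innerHTML comes back as a plain Python string, and the string method is splitlines(), not splitLines(). A tiny sketch:
```
html = "iPhone 11 (64 GB) - Black\nsponsored"
first_line = html.splitlines()[0]   # 'splitLines' would raise AttributeError
print(first_line)                   # -> iPhone 11 (64 GB) - Black
```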

Multithread and AttributeError: 'NoneType' object has no attribute 'groups'

We wrote this code in order to plot the data contained in a txt file:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import re
import numpy as np
import os

names = ['CH', 'LG', 'HG', 'Ts(ns)', 'ToT(ns)']
righe_primo_header = 5
righe_header = 5
canali = 64

# input file
infile = 'Run1_list.txt'

# determine the number of lines, and from it the number of events, in the input file
stream = os.popen('wc -l ' + infile)
nrighe = stream.read()
match = re.match(r" (\S+)\s*", nrighe, re.I)
items = match.groups()
nrighe = float(items[0])
#print( 'nrighe = ',nrighe)

# number of data blocks to read
ntrigger = (nrighe - righe_primo_header) / (canali + righe_header) - 1
ntrigger = int(ntrigger)
print('trovati ', ntrigger, ' eventi')

ncanali_histo = int(np.sqrt(ntrigger))
ncanali_histo = 4096

events = []
file1 = open(infile, 'r')
for line in range(righe_primo_header - 1):
    line = file1.readline()
    #print('saltiamo riga ', line)
line = file1.readline()

for trigger in range(ntrigger):
    #while line:
    for lineh in range(righe_header):
        line = file1.readline()
        #print('saltiamo ',line)
    for canale in range(canali):
        line = file1.readline()
        #print(' elaboriamo ',line)
        match = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+", line, re.I)
        temparray = []
        if match:
            items = match.groups()
            #print(items)
            for colonna in range(len(items)):
                col = items[colonna]
                if col == '-':
                    valore = 0
                else:
                    valore = float(items[colonna])
                temparray.append(valore)
                #print('blocco ', trigger, ' colonna ', colonna, ' ', items[colonna],' -> ',valore)
        #print('temparray = ',temparray)
        events.append(temparray)
file1.close()
print('ultimo trigger ID letto: ', trigger)
#print('events = ',events)

df = pd.DataFrame(events, columns=names)
print(df)

# histogram of HG for a fixed channel
canale = 44
plot_df = df.loc[df['CH'] == canale]
print('plot_df per istogramma:')
print(plot_df)
plot_df.hist(column='HG', bins=ncanali_histo)
plt.title('Multiphoton spectrum HG channel ' + str(canale))

# select one event
evento = 3
plot_df = df[(canali * evento):(canali * evento + canali)]
print('plot_df per scatter plot:')
print(plot_df)
plot_df.plot.scatter(x='CH', y='HG', c='red')
plt.title('HG vs CH event ' + str(evento))
plt.show()
This code works perfectly on macOS but not on Linux and Windows (of course, Windows has no wc command, that part is understood), and we get the following error:
Traceback (most recent call last):
File "Read_list.py", line 20, in <module>
items = match.groups()
AttributeError: 'NoneType' object has no attribute 'groups'
Why does this error happen?
Also, the txt file is on the order of GB: how can I run the code using multithreading? Can you help me?
I uploaded a small example of the data here (see raw): https://pastebin.com/raw/PjVYc3vn
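For the wc -l dependency, a portable line count in pure Python might look like this sketch (so the script no longer shells out at all):
```
def count_lines(path):
    # Count lines without calling "wc -l", so it also works on Windows.
    with open(path, 'rb') as f:
        return sum(1 for _ in f)

nrighe = count_lines('Run1_list.txt')
```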
I resolved the first issue:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import re
import numpy as np
import os

names = ['CH', 'LG', 'HG', 'Ts(ns)', 'ToT(ns)']
righe_primo_header = 5
righe_header = 5
canali = 64

# input file
infile = 'Run1_list.txt'

# determine the number of lines, and from it the number of events, in the input file
stream = os.popen('wc -l ' + infile)
nrighe = stream.read()
#print( 'nrighe = ',nrighe)
match = re.match(r"\s*(\S+) (.*)", nrighe, re.I)
#print( match)
items = match.groups()
nrighe = float(items[0])
#print( 'nrighe = ',nrighe)

# number of data blocks to read
ntrigger = (nrighe - righe_primo_header) / (canali + righe_header) - 1
ntrigger = int(ntrigger)
print('trovati ', ntrigger, ' eventi')

ncanali_histo = int(np.sqrt(ntrigger))
ncanali_histo = 4096

events = []
file1 = open(infile, 'r')
for line in range(righe_primo_header - 1):
    line = file1.readline()
    #print('saltiamo riga ', line)
line = file1.readline()

for trigger in range(ntrigger):
    #while line:
    for lineh in range(righe_header):
        line = file1.readline()
        #print('saltiamo ',line)
    for canale in range(canali):
        line = file1.readline()
        #print(' elaboriamo ',line)
        match = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+", line, re.I)
        temparray = []
        if match:
            items = match.groups()
            #print(items)
            for colonna in range(len(items)):
                col = items[colonna]
                if col == '-':
                    valore = 0
                else:
                    valore = float(items[colonna])
                temparray.append(valore)
                #print('blocco ', trigger, ' colonna ', colonna, ' ', items[colonna],' -> ',valore)
        #print('temparray = ',temparray)
        events.append(temparray)
file1.close()
print('ultimo trigger ID letto: ', trigger)
#print('events = ',events)

df = pd.DataFrame(events, columns=names)
print('Il dataframe totale è il seguente: ', df)

# histogram of HG for a fixed channel
canale = 44
plot_df = df.loc[(df['CH'] == canale) | (df['CH'] == 50)]
print('Il dataframe selezionato è il seguente: ', plot_df)
pd.options.mode.chained_assignment = None  # default='warn'
plot_df['HG'][df['CH'] == 44] *= (1.096)
fig = px.histogram(plot_df, x='HG', color='CH', barmode='overlay', opacity=0.8, title='Multiphoton spectrum HG channel')
fig.update_traces(xbins=dict(start=0.0, end=4096.0, size=1))
fig.show()

# select one event
evento = 3
plot_df2 = df[(canali * evento):(canali * evento + canali)]
print('Il dataframe per lo scatter plot HG vs Ch relativo all evento ', evento, 'è il seguente: ', plot_df2)
fig2 = px.scatter(plot_df2, x='CH', y='HG', title='HG vs CH event ' + str(evento))
fig2.show()
Now, how can I run it with multithreading?
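What I have in mind for the parallel part is something like splitting the event blocks across worker processes with multiprocessing rather than threads, since the regex parsing is CPU-bound. A rough sketch only, with a placeholder parse_block standing in for the per-event parsing above (header handling is simplified):
```
from multiprocessing import Pool

def parse_block(block_lines):
    # Placeholder: apply the per-line re.match parsing from the script above
    # and return the rows for one event block.
    return []

def read_blocks(path, first_header=5, block_size=69):  # 5 header lines + 64 channel lines per event
    with open(path) as f:
        lines = f.readlines()[first_header - 1:]
    for i in range(0, len(lines), block_size):
        yield lines[i:i + block_size]

if __name__ == '__main__':
    with Pool() as pool:
        rows_per_event = pool.map(parse_block, read_blocks('Run1_list.txt'))
```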

pagination in Scrapy on javascript-driven page navigation via Selenium webdriver

I am attempting to paginate through the data table on this page, located below the search form.
My code successfully scrapes the first page and I successfully click the next button (using Selenium) to get the next page of results.
However, creating a Response instance and passing it to self.parse() does not work:
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
Also, even though (if you analyze the call stack) I am returning None from self.parse, I get this warning when running this Scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
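If I read the warning correctly, it only concerns a generator that contains return <value>; a bare return is fine. A tiny sketch of the difference:
```
def parse(self, response):
    for row in response.css('tr'):
        yield {'html': row.get()}
    # return self.parse(r)   # returning a *value* from a generator -> triggers the warning
    return                   # a bare return (or falling off the end) does not
```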
Here is my current source code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver

URL_20 = "https://www.signalstart.com/search-signals"
URL_1000 = "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"

class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()

def raw_page_url(i=1):
    """
    Return raw page of 100 results. There are 8 such pages
    :param i: which page number
    :return:
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)

class SignalStartSpider(scrapy.Spider):
    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently. But turns out they dont.
        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        print(" >>>>>> URL of the response object is {}".format(response.url))
        if len(response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t);
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = Provider()
            Behold().show('provider')
            details_url = None
            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1:  # name
                    details_url = datum.css("a::attr(href)").get()
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            if details_url:
                yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
        print("------------------------------- next page logic --------------------------------------")
        next = self.driver.find_element_by_css_selector('.fa-angle-right')
        if next is not None:
            print(" **** NEXT IS -NOT- NONE")
            next.click()
            page_source = self.driver.page_source
            r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
            print(" >>>> calling self.parse again")
            return self.parse(r)
        else:
            print(" **** NEXT IS NONE")
            return None
        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
Instead of recursively calling self.parse, it is better to use a while loop and simply re-bind the Response instance with the page_source from the Selenium webdriver. Working code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver

URL_20 = "https://www.signalstart.com/search-signals"
URL_1000 = "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"

class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()

def raw_page_url(i=1):
    """
    Return raw page of 100 results. There are 8 such pages
    :param i: which page number
    :return:
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)

class SignalStartSpider(scrapy.Spider):
    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently. But turns out they dont.
        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        print(" >>>>>> URL of the response object is {}".format(response.url))
        if len(response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t);
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        while True:
            for provider in response.xpath("//div[@class='row']//tr"):
                data_row = Provider()
                Behold().show('provider')
                details_url = None
                for i, datum in enumerate(provider.xpath('td')):
                    Behold().show('i', 'datum')
                    if i == 1:  # name
                        details_url = datum.css("a::attr(href)").get()
                    if i in skip:
                        print(".....skipping")
                        continue
                    text = html_text.extract_text(datum.get())
                    column_name = td[i]
                    if column_name in postprocess:
                        text = postprocess[column_name](text)
                    data_row[column_name] = text
                if details_url:
                    yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
            print("------------------------------- next page logic --------------------------------------")
            next = self.driver.find_element_by_css_selector('.fa-angle-right')
            if next is not None:
                print(" **** NEXT IS -NOT- NONE")
                next.click()
                page_source = self.driver.page_source
                r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
                print(" >>>> looping self.parse again")
                response = r
            else:
                print(" **** NEXT IS NONE")
                break
        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)

Odoo computed fields: result in unsupported operand type(s)

In Odoo 8 I have a computed field with a function which results in an error.
I can't seem to make things work and need some help.
My test code:
from openerp import models, fields, api, _
from openerp.tools.translate import _
import openerp.addons.decimal_precision as dp

class plano(models.Model):
    _name = 'plano'
    _description = "variabelen plaat toeslagen, rillen kenmerken."

    name = fields.Char('Plano naam', required=True)
    constructie_id = fields.Char('Fefco constructie')
    testB_p1 = fields.Char('Breedte P1', help='Tekst veld P1 (variabele breedte P1)')
    toeslagB_p1 = fields.Float('toeslag breedte P1 (variabel Breedte P1)', digits=(3, 1))
    testL_p1 = fields.Char('Lengte P1', help='Tekst veld P1 (variabele lengte P1)')
    toeslagL_p1 = fields.Float('toeslag lengte P1 (variabel lengte P1)', digits=(3, 1))
    Kw = fields.Float('Kwaliteit dikte in mm', digits=(3, 0), help="Wordt uit gerelateerd veld van model Quality gehaald.")

class calc(models.Model):
    @api.depends('name')
    def _functionb_p1(self):
        val1 = 0.0
        if plano.testB_p1 != 'H+':
            val1 = calc.hoogte + (2.0 * plano.Kw) + 2.0
        elif plano.testB_p1 != 'B':
            val1 = calc.breedte + (plano.toeslagB_p1 * plano.Kw)
        return val1

    _name = "calc"
    _description = "kostprijs berekening."

    name = fields.Many2one('plano', help="Kostprijs berekening nummer e.g. C1234")
    lengte = fields.Float('Lengte in mm', digits=(4, 0), help="Lengte in mm")
    breedte = fields.Float('Breedte in mm', digits=(4, 0))
    hoogte = fields.Float('Hoogte in mm', digits=(4, 0))
    aantal = fields.Float('Aantal stuks', digits=(4, 0))
    planob_p1 = fields.Float('Plano Breedte P1')
    planobt_p1 = fields.Float('Plano Breedte toeslag P1')
    val1 = fields.Float(compute='_functionb_p1', store=True,
                        string='Aanmaak Plano breedte P1',
                        help="Berekening vanuit functie _functionb_p1")
ERROR:
File "....8.0\test\models\calc.py", line 47, in _functionb_p1
val1 = calc.hoogte + (2.0 * plano.Kw) + 2.0
TypeError: unsupported operand type(s) for *: 'float' and 'Float'
The TypeError is very weird. I never had such problems with Odoo Float...
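To illustrate what the traceback is actually complaining about: on the class itself, Kw is still the field object, not a number; only on a record does it become a Python float. A small sketch (not your module, just the distinction):
```
from openerp import models, fields

class plano(models.Model):
    _name = 'plano'
    Kw = fields.Float('Kwaliteit dikte in mm')

# At class level Kw is still the field *descriptor*, not a number:
print(type(plano.Kw))   # openerp.fields.Float -> 2.0 * plano.Kw raises the TypeError above
# On a record it is an ordinary float:
# record = env['plano'].browse(some_id)
# 2.0 * record.Kw       # works
```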
But here are some hints for your compute function. It should use @api.multi or @api.one (the latter is deprecated) as an additional decorator.
You also need a connection between the plano and calc models. Your current model relation (there is none) doesn't allow the calculation you're looking for, because you need both a plano record and a calc record.
You're computing the value on the calc model, so I'll try to give you the right method, with one condition: there is a Many2one field on the calc model named plano_id.
@api.multi
@api.depends('name')
def _functionb_p1(self):
    for calc in self:
        val1 = 0.0
        plano = calc.plano_id
        if plano.testB_p1 != 'H+':
            val1 = calc.hoogte + (2.0 * plano.Kw) + 2.0
        elif calc.plano_id.testB_p1 != 'B':
            val1 = calc.breedte + (plano.toeslagB_p1 * plano.Kw)
        calc.val1 = val1

# my condition!
plano_id = fields.Many2one(comodel_name="plano", string="Plano")