Adding fields to item in scrapy from list - scrapy

I have a long list of fields to add to a scrapy item but can't figure out how to do it. Basically I am trying to get the output of
class myItem(scrapy.Item):
a = scrapy.field()
b = scrapy.field()
But I want to create it from a list like
['a','b']
I tried
fields = ['a','b']
def generateFields(fields):
item = myItem()
for field in fields:
field = scrapy.Field()
return item
But that didn't work. Thanks!

You can manage it by using python magic method __dict__ pretty much as:
fields = ['a', 'b', 'c']
class MyItem(scrapy.Item):
def __init__(self):
for f in fields:
self.__dict__[f] = scrapy.field()
my_item = MyItem()
And so on...
Seems that my previous comment is valid for usual python classes, not for scrapy.Item class. Following code is working:
from scrapy import Item, Field
fields = ['a', 'b', 'c']
def generate_item(fields):
item = Item()
for f in fields:
item.fields[f] = Field()
return item
item_instance = generate_item(fields)
item_instance['a'] = 'some_value'
item_instance['b'] = 12
print(item_instance)

Related

Can't yield paralel requests conducted by items pipeline

In my scrapy code I'm trying to yield the following figures from parliament's website where all the members of parliament (MPs) are listed. Opening the links for each MP, I'm making parallel requests to get the figures I'm trying to count. I didn't use metas here because my code doesn't just make consecutive requests but it makes parallel requests for the figures after the individual page of the MP is requested. Thus I thought item containers would fit my purpose better.
Here are the figures I'm trying to scrape
How many bill proposals that each MP has their signature on
How many question proposals that each MP has their signature on
How many times that each MP spoke on the parliament
In order to count and yield out how many bills has each member of parliament has their signature on, I'm trying to write a scraper on the members of parliament which works with 3 layers:
Starting with the link where all MPs are listed
From (1) accessing the individual page of each MP where the three information defined above is displayed
3a) Requesting the page with bill proposals and counting the number of them by len function
3b) Requesting the page with question proposals and counting the number of them by len function
3c) Requesting the page with speeches and counting the number of them by len function
What I want: I want to yield the inquiries of 3a,3b,3c with the name and the party of the MP
Problem: My code above just doesn't yield anything but empty dictionaries for each request
Note: Because my parse functions doesn't work like parse => parse2 => parse3 but rather I have 3 parallel parse functions after parse2, I failed to use the meta because I'm not yielding all the values at parse three. Therefore I preferred using the pipelines which apparently doesn't work.
Main code:
'''
from scrapy import Spider
from scrapy.http import Request
from ..items import MeclisItem
import logging
class MvSpider(Spider):
name = 'mv'
allowed_domains = ['tbmm.gov.tr']
start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']
def parse(self, response):
items = MeclisItem()
mv_list = mv_list = response.xpath("//ul[#class='list-group list-group-flush']") #taking all MPs listed
for mv in mv_list:
items['name'] = mv.xpath("./li/div/div/a/text()").get() # MP's name taken
items['party'] = mv.xpath("./li/div/div[#class='col-md-4 text-right']/text()").get().strip() #MP's party name taken
partial_link = mv.xpath('.//div[#class="col-md-8"]/a/#href').get()
full_link = response.urljoin(partial_link)
yield Request(full_link, callback = self.mv_analysis)
pass
def mv_analysis(self, response):
items = MeclisItem()
billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/#href").get()
billprop_link = response.urljoin(billprop_link_path)
questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/#href").get()
questionprop_link = response.urljoin(questionprop_link_path)
speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/#href").get()
speech_link = response.urljoin(speech_link_path)
yield Request(billprop_link, callback = self.bill_prop_counter) #number of bill proposals to be requested
yield Request(questionprop_link, callback = self.quest_prop_counter) #number of question propoesals to be requested
yield Request(speech_link, callback = self.speech_counter) #number of speeches to be requested
yield items
# COUNTING FUNCTIONS
def bill_prop_counter(self,response):
items = MeclisItem()
billproposals = response.xpath("//tr[#valign='TOP']")
items['bill_prop_count'] = len(billproposals)
pass
def quest_prop_counter(self, response):
items = MeclisItem()
questionproposals = response.xpath("//tr[#valign='TOP']")
items['res_prop_count'] = len(questionproposals)
pass
def speech_counter(self, response):
items = MeclisItem()
speeches = response.xpath("//tr[#valign='TOP']")
items['speech_count'] = len(speeches)
pass
'''
items.py code:
import scrapy
class MeclisItem(scrapy.Item):
name = scrapy.Field()
party = scrapy.Field()
bill_prop_count = scrapy.Field()
res_prop_count = scrapy.Field()
speech_count = scrapy.Field()
pass
What's displayed at scrapy:
I checked many questions on stackoverflow but still couldn't figure a way out. Thanks in advance.
ps: Spent ten minutes seperately to colour the code above and couldn't make it either :(
Note: Because my parse functions doesn't work like parse => parse2 => parse3 but rather I have 3 parallel parse functions after parse2, I failed to use the meta because I'm not yielding all the values at parse three.
You can do it like this:
Edit:
import scrapy
from scrapy import Spider
from scrapy.http import Request
# from ..items import MeclisItem
import logging
class MeclisItem(scrapy.Item):
name = scrapy.Field()
party = scrapy.Field()
bill_prop_count = scrapy.Field()
res_prop_count = scrapy.Field()
speech_count = scrapy.Field()
class MvSpider(Spider):
name = 'mv'
allowed_domains = ['tbmm.gov.tr']
start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']
def parse(self, response):
mv_list = mv_list = response.xpath("//ul[#class='list-group list-group-flush']") #taking all MPs listed
for mv in mv_list:
item = MeclisItem()
item['name'] = mv.xpath("./li/div/div/a/text()").get() # MP's name taken
item['party'] = mv.xpath("./li/div/div[#class='col-md-4 text-right']/text()").get().strip() #MP's party name taken
partial_link = mv.xpath('.//div[#class="col-md-8"]/a/#href').get()
full_link = response.urljoin(partial_link)
yield Request(full_link, callback=self.mv_analysis, cb_kwargs={'item': item})
def mv_analysis(self, response, item):
billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/#href").get()
billprop_link = response.urljoin(billprop_link_path)
questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/#href").get()
questionprop_link = response.urljoin(questionprop_link_path)
speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/#href").get()
speech_link = response.urljoin(speech_link_path)
yield Request(billprop_link,
callback=self.bill_prop_counter,
cb_kwargs={'item': item, 'questionprop_link': questionprop_link, 'speech_link': speech_link}) #number of bill proposals to be requested
# COUNTING FUNCTIONS
def bill_prop_counter(self, response, item, questionprop_link, speech_link):
billproposals = response.xpath("//tr[#valign='TOP']")
item['bill_prop_count'] = len(billproposals)
yield Request(questionprop_link,
callback=self.quest_prop_counter,
cb_kwargs={'item': item, 'speech_link': speech_link}) #number of question propoesals to be requested
def quest_prop_counter(self, response, item, speech_link):
questionproposals = response.xpath("//tr[#valign='TOP']")
item['res_prop_count'] = len(questionproposals)
yield Request(speech_link,
callback=self.speech_counter,
cb_kwargs={'item': item}) #number of speeches to be requested
def speech_counter(self, response, item):
speeches = response.xpath("//tr[#valign='TOP']")
item['speech_count'] = len(speeches)
yield item

pagination in Scrapy on javascript-driven page navigation via Selenium webdriver

I am attempting to paginate through the data table on this page, located below the search form.
My code successfully scrapes the first page and I successfully click the next button (using Selenium) to get the next page of results.
However, attempting to create a Response instance and passing it to self.parse() does not work:
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
Also, even though if you analyze the call stack, I am returning None from self.parse, I get this warning when running this scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
Here is my current source code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
rank = scrapy.Field()
name = scrapy.Field()
gain = scrapy.Field()
pips = scrapy.Field()
drawdown = scrapy.Field()
trades = scrapy.Field()
type = scrapy.Field()
monthly = scrapy.Field()
# chart = scrapy.Field()
price = scrapy.Field()
age = scrapy.Field()
# added = scrapy.Field()
# action = scrapy.Field()
won = scrapy.Field()
profit_factor = scrapy.Field()
daily = scrapy.Field()
monthly = scrapy.Field()
def raw_page_url(i=1):
"""
Return raw page of 100 results. There are 8 such pages
:param i: which page number
:return:
"""
return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)
class SignalStartSpider(scrapy.Spider):
page = 1
name = 'signalstart'
start_urls = [
# raw_page_url(page),
URL_20
]
def __init__(self):
#self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')
def parse_details(self, response):
class Details(scrapy.Item):
xpath = scrapy.Field()
extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.
fields = {
'won': Details(),
'profit_factor': Details(),
'daily': Details(),
'monthly': Details()
}
fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
fields['profit_factor']['xpath'] = "//li[#class='list-group-item popovers']"
fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
for field, field_processor in fields.items():
print(f" Process {field}")
elem = response.xpath(field_processor['xpath'])
_, value = html_text.extract_text(elem.get()).split(':')
response.meta["data_row"][field] = value
yield response.meta["data_row"]
def parse(self, response):
print(" >>>>>> URL of the response object is {}".format(response.url))
if len (response.url) > 10:
self.driver.get(response.url)
cols = "rank name gain pips drawdown trades type monthly chart price age added action"
skip = [7, 8, 11, 12]
def age_to_months(t):
t = t.replace('m', 'M')
d = durations.Duration(t);
return d.to_months()
postprocess = {
'age': lambda t: age_to_months(t)
}
td = dict()
for i, col in enumerate(cols.split()):
td[i] = col
Behold().show('td')
for provider in response.xpath("//div[#class='row']//tr"):
data_row = Provider()
Behold().show('provider')
details_url = None
for i, datum in enumerate(provider.xpath('td')):
Behold().show('i', 'datum')
if i == 1: # name
details_url = datum.css("a::attr(href)").get()
if i in skip:
print(".....skipping")
continue
text = html_text.extract_text(datum.get())
column_name = td[i]
if column_name in postprocess:
text = postprocess[column_name](text)
data_row[column_name] = text
if details_url:
yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
print("------------------------------- next page logic --------------------------------------")
next = self.driver.find_element_by_css_selector('.fa-angle-right')
if next is not None:
print(" **** NEXT IS -NOT- NONE")
next.click()
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
else:
print(" **** NEXT IS NONE")
return None
# next_page = response.css('.fa-angle-right').get()
# if next_page is not None:
# yield response.follow(next_page, self.parse)
Instead of recursively calling self.parse it is better to use a while loop and simply re-bind the Response instance with the page_source from Selenium webdriver. working code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
rank = scrapy.Field()
name = scrapy.Field()
gain = scrapy.Field()
pips = scrapy.Field()
drawdown = scrapy.Field()
trades = scrapy.Field()
type = scrapy.Field()
monthly = scrapy.Field()
# chart = scrapy.Field()
price = scrapy.Field()
age = scrapy.Field()
# added = scrapy.Field()
# action = scrapy.Field()
won = scrapy.Field()
profit_factor = scrapy.Field()
daily = scrapy.Field()
monthly = scrapy.Field()
def raw_page_url(i=1):
"""
Return raw page of 100 results. There are 8 such pages
:param i: which page number
:return:
"""
return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)
class SignalStartSpider(scrapy.Spider):
page = 1
name = 'signalstart'
start_urls = [
# raw_page_url(page),
URL_20
]
def __init__(self):
#self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')
def parse_details(self, response):
class Details(scrapy.Item):
xpath = scrapy.Field()
extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.
fields = {
'won': Details(),
'profit_factor': Details(),
'daily': Details(),
'monthly': Details()
}
fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
fields['profit_factor']['xpath'] = "//li[#class='list-group-item popovers']"
fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
for field, field_processor in fields.items():
print(f" Process {field}")
elem = response.xpath(field_processor['xpath'])
_, value = html_text.extract_text(elem.get()).split(':')
response.meta["data_row"][field] = value
yield response.meta["data_row"]
def parse(self, response):
print(" >>>>>> URL of the response object is {}".format(response.url))
if len (response.url) > 10:
self.driver.get(response.url)
cols = "rank name gain pips drawdown trades type monthly chart price age added action"
skip = [7, 8, 11, 12]
def age_to_months(t):
t = t.replace('m', 'M')
d = durations.Duration(t);
return d.to_months()
postprocess = {
'age': lambda t: age_to_months(t)
}
td = dict()
for i, col in enumerate(cols.split()):
td[i] = col
Behold().show('td')
while True:
for provider in response.xpath("//div[#class='row']//tr"):
data_row = Provider()
Behold().show('provider')
details_url = None
for i, datum in enumerate(provider.xpath('td')):
Behold().show('i', 'datum')
if i == 1: # name
details_url = datum.css("a::attr(href)").get()
if i in skip:
print(".....skipping")
continue
text = html_text.extract_text(datum.get())
column_name = td[i]
if column_name in postprocess:
text = postprocess[column_name](text)
data_row[column_name] = text
if details_url:
yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
print("------------------------------- next page logic --------------------------------------")
next = self.driver.find_element_by_css_selector('.fa-angle-right')
if next is not None:
print(" **** NEXT IS -NOT- NONE")
next.click()
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> looping self.parse again")
response = r
else:
print(" **** NEXT IS NONE")
break
# next_page = response.css('.fa-angle-right').get()
# if next_page is not None:
# yield response.follow(next_page, self.parse)

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URL's from Database and also need ID's associated with the URL so that I can pass it in the ITEMS pipeline and store in the table along with items.
I am using "make_requests_from_url(row[1])" to pass the start URL's "start_urls = []" which forms the list of starting URL's. The id's row[0] is what I need to pass to Items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem
class AmzProductinfoSpiderSpider(scrapy.Spider):
name = 'amz_ProductInfo_Spider'
nextPageNumber = 2
allowed_domains = ['amazon.in']
start_urls = []
url_fid = []
def __init__(self):
self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
self.cursor = self.connection.cursor()
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield self.make_requests_from_url(row[1])
I have tried with comparing "response.url" in parse method but that changes as spider moves on to next page.
Not sure how can I achieve this. any direction is appreciated.
It's not clear why do you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)
def parse(self, response):
url_id = response.meta["url_id"]

Scrapy: How to initialize item with None?

For example, the item in scrapy is:
class CrawlerItem(scrapy.Item):
name = scrapy.Field()
country = scrapy.Field()
title = scrapy.Field()
subject = scrapy.Field()
......
item = CrawlerItem()
How could I initialize all of the fields value of item with None?
I could do this with:
item['name'] = None
item['country'] = None
...
But the problem is, there're far two many fields. Are there any methods that could do:? item.set_all(None)
You just need to iterate through self.fields in the items class
class CrawlerItem(scrapy.Item):
name = scrapy.Field()
country = scrapy.Field()
title = scrapy.Field()
subject = scrapy.Field()
def set_all(self, value):
for keys, _ in self.fields.items():
self[keys] = value
And then in your code you can call
item = CrawlerItem()
item.set_all(None)
print(item)

Scrapy items outputed as tuples instead of list

For whatever reason all items are returned as tuples. Not sure what I am missing. In all other spiders and projects it was just a list (when i use extract()).
{'acne': (None,),
'function': ([u'\u2027Preservative'],),
'function0': u'\u2027Preservative',
'irritant': (None,),
'name': (u'Potassium Sorbate',),
'safety': (u'3',),
'url': 'http://cosdna.com/eng/383bb7435.html'}
Here is my spider code.
def parse(self, response):
inspect_response(response, self)
a = response.xpath('//table//tr')
for i in a:
item = CosdnaExtItem()
item['name'] = i.xpath('./td/a/text()').extract_first(),
item['url'] = i.xpath('./td/a/#href').extract_first(),
item['function'] = i.xpath('.//td[2]/span//text()').extract(),
item['acne'] = i.xpath('.//td[3]/span//text()').extract_first(),
item['irritant'] = i.xpath('.//td[4]/span//text()').extract_first(),
item['safety'] = i.xpath('.//td[5]/div//text()').extract_first(),
yield item
Note extra commas at the end of lines:
item['function'] = i.xpath('.//td[2]/span//text()').extract(),
in Python
x = y,
is the same as
x = (y,)