pagination in Scrapy on javascript-driven page navigation via Selenium webdriver

pagination in Scrapy on javascript-driven page navigation via Selenium webdriver - scrapy

I am attempting to paginate through the data table on this page, located below the search form.
My code successfully scrapes the first page and I successfully click the next button (using Selenium) to get the next page of results.
However, attempting to create a Response instance and passing it to self.parse() does not work:
page_source = self.driver.page_source
r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
print(" >>>> calling self.parse again")
return self.parse(r)
Also, even though an analysis of the call stack suggests that I am only ever returning None from self.parse, I get this warning when running this Scrapy spider:
The "SignalStartSpider.parse" method is a generator and includes a "return" statement with a value different than None. This could lead to unexpected behaviour. Please see https://docs.python.org/3/reference/simple_stmts.html#the-return-statement for details about the semantics of the "return" statement within generators
warn_on_generator_with_return_value(spider, callback)
Here is my current source code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
    """One signal provider: listing-table columns plus detail-page statistics."""

    # Columns taken from the listing table.
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # The table's chart, added and action columns are deliberately not stored.

    # Values filled in from the provider's detail page.
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()  # declared once; it used to be listed twice
def raw_page_url(i=1, page_size=100):
    """
    Return the URL of one raw results page.

    With the default page size of 100 there were 8 such pages when this was
    written.

    :param i: which page number (sent as the ``p`` query parameter)
    :param page_size: results per page (sent as the ``ps`` query parameter)
    :return: the paging URL as a string
    """
    return (
        "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705"
        "&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type="
        "&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal="
        "&searchVal=&serversMultiSearch=&ps={}&p={}&z=0.024967722664414493"
    ).format(page_size, i)
class SignalStartSpider(scrapy.Spider):
    """Scrape the signalstart.com provider table, paging with a Selenium driver.

    This is the spider as posted in the question: the recursive pagination at
    the end of ``parse`` is kept unchanged and annotated where it goes wrong.
    """

    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's constructor arguments so the base class is set up.
        super().__init__(*args, **kwargs)
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):
        """Add the detail-page statistics to the row carried in ``response.meta``.

        :param response: detail page; ``response.meta["data_row"]`` is the
            Provider item built from the listing row
        :return: generator yielding the completed item
        """
        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field() # I thought different fields would be extracted differently. But turns out they dont.

        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }
        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        # XPath attribute tests are written with '@' ('#class' is not valid XPath).
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"
        for field, field_processor in fields.items():
            print(f" Process {field}")
            elem = response.xpath(field_processor['xpath'])
            # The element text has the form "<label>:<value>"; keep the value.
            _, value = html_text.extract_text(elem.get()).split(':')
            response.meta["data_row"][field] = value
        yield response.meta["data_row"]

    def parse(self, response):
        """Yield one detail-page request per table row, then try the next page.

        :param response: listing page (from Scrapy, or rebuilt from Selenium)
        :return: generator of scrapy.Request objects handled by parse_details
        """
        print(" >>>>>> URL of the response object is {}".format(response.url))
        # Responses rebuilt from Selenium carry the placeholder URL '://!',
        # which is too short to pass this check, so only real URLs are loaded.
        if len (response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]  # indexes of monthly, chart, added, action

        def age_to_months(t):
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {
            'age': lambda t: age_to_months(t)
        }
        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col
        Behold().show('td')
        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = Provider()
            Behold().show('provider')
            details_url = None
            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1: # name
                    details_url = datum.css("a::attr(href)").get()
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            if details_url:
                yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})
        print("------------------------------- next page logic --------------------------------------")
        next = self.driver.find_element_by_css_selector('.fa-angle-right')
        if next is not None:
            print(" **** NEXT IS -NOT- NONE")
            next.click()
            page_source = self.driver.page_source
            r = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')
            print(" >>>> calling self.parse again")
            # NOTE: parse() contains ``yield`` and is therefore a generator.
            # This ``return`` only stores the inner generator as the
            # StopIteration value; nothing iterates it, so the next page is
            # never parsed, and Scrapy warns about a generator that returns
            # a value.
            return self.parse(r)
        else:
            print(" **** NEXT IS NONE")
            return None

Instead of recursively calling self.parse, it is better to use a while loop and simply re-bind the response variable to a new Response instance built from the Selenium webdriver's page_source. Working code:
# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
class Provider(scrapy.Item):
    """One signal provider: listing-table columns plus detail-page statistics."""

    # Columns taken from the listing table.
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # The table's chart, added and action columns are deliberately not stored.

    # Values filled in from the provider's detail page.
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()
    monthly = scrapy.Field()  # declared once; it used to be listed twice
def raw_page_url(i=1, page_size=100):
    """
    Return the URL of one raw results page.

    With the default page size of 100 there were 8 such pages when this was
    written.

    :param i: which page number (sent as the ``p`` query parameter)
    :param page_size: results per page (sent as the ``ps`` query parameter)
    :return: the paging URL as a string
    """
    return (
        "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705"
        "&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type="
        "&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal="
        "&searchVal=&serversMultiSearch=&ps={}&p={}&z=0.024967722664414493"
    ).format(page_size, i)
class SignalStartSpider(scrapy.Spider):
    """Scrape the signalstart.com provider table, paging with a Selenium driver.

    The "next" arrow is clicked in a real browser and each resulting page
    source is wrapped in an HtmlResponse that ``parse`` processes in a loop.
    """

    page = 1
    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's constructor arguments so the base class is set up.
        super().__init__(*args, **kwargs)
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider."""
        self.driver.quit()

    def parse_details(self, response):
        """Add the detail-page statistics to the row carried in ``response.meta``.

        :param response: detail page; ``response.meta["data_row"]`` is the
            Provider item built from the listing row
        :return: generator yielding the completed item
        """
        # Every field is extracted the same way, so a field -> XPath map is enough.
        detail_xpaths = {
            'won': "//li[contains(text(),'Won:')]",
            'profit_factor': "//li[@class='list-group-item popovers']",
            'daily': "//li[contains(text(),'Daily:')]",
            'monthly': "//li[contains(text(),'Monthly:')]",
        }
        data_row = response.meta["data_row"]
        for field, xpath in detail_xpaths.items():
            print(f" Process {field}")
            elem = response.xpath(xpath)
            # Split on the first colon only, so a value that itself contains
            # a colon does not break the two-way unpacking.
            _, value = html_text.extract_text(elem.get()).split(':', 1)
            data_row[field] = value
        yield data_row

    def parse(self, response):
        """Yield one detail-page request per table row, across all pages.

        :param response: the listing page downloaded by Scrapy
        :return: generator of scrapy.Request objects handled by parse_details
        """
        # Local import: selenium is already a dependency of this module.
        from selenium.common.exceptions import NoSuchElementException

        print(" >>>>>> URL of the response object is {}".format(response.url))
        # Placeholder URLs such as '://!' are too short to pass this check.
        if len (response.url) > 10:
            self.driver.get(response.url)
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]  # indexes of monthly, chart, added, action

        def age_to_months(t):
            # presumably 'm' would be read as minutes and 'M' as months by
            # the durations package -- TODO confirm
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {'age': age_to_months}
        # Column index -> column name. Behold looks locals up by name, so the
        # names td / provider / i / datum must stay as they are.
        td = dict(enumerate(cols.split()))
        Behold().show('td')

        while True:
            for provider in response.xpath("//div[@class='row']//tr"):
                data_row = Provider()
                Behold().show('provider')
                details_url = None
                for i, datum in enumerate(provider.xpath('td')):
                    Behold().show('i', 'datum')
                    if i == 1: # name
                        details_url = datum.css("a::attr(href)").get()
                    if i in skip:
                        print(".....skipping")
                        continue
                    text = html_text.extract_text(datum.get())
                    column_name = td[i]
                    if column_name in postprocess:
                        text = postprocess[column_name](text)
                    data_row[column_name] = text
                if details_url:
                    yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})

            print("------------------------------- next page logic --------------------------------------")
            # find_element_* raises when nothing matches (it never returns
            # None), so a missing arrow has to be caught to end the loop.
            try:
                next_button = self.driver.find_element_by_css_selector('.fa-angle-right')
            except NoSuchElementException:
                print(" **** NEXT IS NONE")
                break
            print(" **** NEXT IS -NOT- NONE")
            next_button.click()
            # NOTE(review): page_source is read immediately after the click.
            # If the table is replaced asynchronously, or the arrow is still
            # present on the last page, an explicit wait / end-of-results
            # check is needed here -- confirm against the live site.
            page_source = self.driver.page_source
            print(" >>>> looping self.parse again")
            response = scrapy.http.HtmlResponse('://!', body=page_source, encoding='utf-8')

Related

Scrapy Item Pipeline does not process items to sqlite

I use Scrapy to get data from a website (www.imensa.de) and I need to persist that data as items in a sqlite3 database. The spider works fine, but I think there is something wrong with the pipeline.
Within the pipeline, the new .db file is created properly with the specified tables and columns, but it is not getting populated.
Spider.py
# Import library
from __future__ import absolute_import
import scrapy
from scrapy.loader import Item
from scrapy.loader import ItemLoader
from ..items import Mensen_Table
from ..items import Mensa_Meal
import datetime
from datetime import date
from datetime import timedelta
global base_url
global meal_plan_day_list
meal_plan_day_list = []
# Create Spider class
class mensen_crawler(scrapy.Spider):
    """Crawl imensa.de: states -> cities -> canteens -> daily meal plans."""

    # Name of spider
    name = 'mensen'
    # Website to scrape
    allowed_domains = ['imensa.de']
    start_urls = ['https://www.imensa.de/']
    # Root that the site's relative links are appended to.
    base_url = 'https://www.imensa.de/'

    # STATE_NAME / STATE_LINK
    def parse(self, response):
        """Request the page of each German state listed on the start page."""
        names = response.css('.group a::text')
        links = response.css('.group a::attr(href)')
        for i, (name, link) in enumerate(zip(names, links)):
            # Only the first 16 entries are German states.
            if i >= 16:
                break
            state_name = name.get()
            state_link = self.base_url + link.get()
            yield scrapy.Request(state_link, callback=self.parse_layer2,
                                 cb_kwargs={'state_name': state_name, 'state_link': state_link})

    # CITY_NAME / CITY_LINK
    def parse_layer2(self, response, state_name, state_link):
        """Request the index page of every city listed for one state."""
        for city, link in zip(response.css('.group a::text'), response.css('.group a::attr(href)')):
            city_name = city.get()
            # Log instead of ``yield print(...)``, which handed None to Scrapy.
            self.logger.debug('current_city: %s (state: %s)', city_name, state_name)
            city_link_part = link.get().split('/')[0]
            city_link_real = self.base_url + city_link_part + '/index.html'
            yield scrapy.Request(city_link_real, callback=self.parse_layer3,
                                 cb_kwargs={'state_name': state_name, 'state_link': state_link,
                                            'city_name': city_name, 'city_link': city_link_real})

    # MENSA_NAME / MENSA_LINK
    def parse_layer3(self, response, state_name, state_link, city_name, city_link):
        """Request the page of every canteen listed for one city."""
        city_part = city_link.replace('https://www.imensa.de/', '').split('/')[0]
        for group in response.css('.group'):
            uni_name = group.css('h2::text').get()
            for mensa in group.css('.element'):
                mensa_name = mensa.css('a::text').get()
                mensa_link = self.base_url + city_part + '/' + mensa.css('a::attr(href)').get()
                self.logger.debug('mensa: %s -> %s', mensa_name, mensa_link)
                yield scrapy.Request(mensa_link, callback=self.parse_layer4, cb_kwargs={
                    'state_name': state_name,
                    'state_link': state_link,
                    'city_name': city_name,
                    'city_link': city_link,
                    'uni_name': uni_name,
                    'mensa_name': mensa_name,
                    'mensa_link': mensa_link,
                })

    # CREATE MENSA ITEM
    def parse_layer4(self, response, state_name, state_link, city_name, city_link, uni_name, mensa_name, mensa_link):
        """Yield the canteen item, then one request per listed meal-plan day."""
        # ``.get()`` returns None rather than raising when nothing matches, so
        # the fallback of 0 has to be passed as ``default``.
        rating_avg = response.css('.aw-ratings-average::text').get(default=0)
        rating_count = response.css('.aw-ratings-count::text').get(default=0)
        mensa_location = ', '.join(response.css('a.panel-body::text').getall())

        l = ItemLoader(item=Mensen_Table(), response=response)
        l.add_value('state_name', state_name)
        l.add_value('state_link', state_link)
        l.add_value('city_name', city_name)
        l.add_value('city_link', city_link)
        l.add_value('uni_name', uni_name)
        l.add_value('mensa_name', mensa_name)
        l.add_value('mensen_link', mensa_link)
        l.add_value('mensa_address', mensa_location)
        l.add_value('mensen_rating_avg', rating_avg)
        l.add_value('mensen_rating_count', rating_count)
        yield l.load_item()

        # Only the first list group holds the day navigation.
        day_groups = response.css('.col-md-4.no-padding-xs .list-group')
        if not day_groups:
            return
        date_list = day_groups[0].css('.pull-right::text').getall()
        day_list = day_groups[0].css('a::text').getall()

        # PROCESS DATE LIST: 'heute' / 'morgen' (today / tomorrow) become dates.
        today = datetime.date.today()
        relative_days = {
            'heute': today,
            'morgen': today + datetime.timedelta(days=1),
        }
        meal_plan_date_list = [
            relative_days[label].strftime('%d.%m.%Y') if label in relative_days else label
            for label in date_list
        ]

        # PROCESS LINK LIST: each day has its own page next to index.html.
        plan_root = mensa_link.replace('index.html', '')
        for meal_date, day in zip(meal_plan_date_list, day_list):
            link = plan_root + day.lower() + '.html'
            yield scrapy.Request(link, callback=self.parse_layer5,
                                 cb_kwargs={'mensa_name': mensa_name, 'mensa_link': link,
                                            'day': day, 'date': meal_date})

    # PARSE MEAL PLAN
    def parse_layer5(self, response, mensa_name, mensa_link, day, date):
        """Yield one Mensa_Meal item per meal on a single day's plan."""
        for element in response.css('.aw-meal-category'):
            for sub in element.css('.aw-meal.row.no-margin-xs'):
                l = ItemLoader(item=Mensa_Meal(), response=response, selector=sub)
                meal_name = sub.css('p.aw-meal-description::text').get()
                raw_price = sub.css('.col-sm-2.no-padding-xs.aw-meal-price::text').get()
                meal_price = raw_price.replace('€', '').strip() if raw_price is not None else 0
                raw_attributes = sub.css('.small.aw-meal-attributes span::text').get()
                meal_attributes = raw_attributes.replace(u'\xa0', u'') if raw_attributes is not None else ''
                l.add_value('mensa_name', mensa_name)
                l.add_value('date_of_meal_plan', date)
                l.add_value('meal_name', meal_name)
                l.add_value('meal_attributes', meal_attributes)
                l.add_value('meal_price', meal_price)
                yield l.load_item()
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemadapter import ItemAdapter
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from scrapy.item import Item
class Mensen_Table(scrapy.Item):
    """A canteen with its state / city / university hierarchy and ratings."""

    # NOTE(review): no output processors are declared here, so an ItemLoader
    # stores each of these values as a list -- confirm consumers expect that.
    state_name = scrapy.Field()
    state_link = scrapy.Field()
    city_name = scrapy.Field()
    city_link = scrapy.Field()
    uni_name = scrapy.Field()
    mensa_name = scrapy.Field()
    mensen_link = scrapy.Field()
    mensa_address = scrapy.Field()
    mensen_rating_avg = scrapy.Field()
    mensen_rating_count = scrapy.Field()
    five_star_ratings = scrapy.Field()
    four_star_ratings = scrapy.Field()
    three_star_ratings = scrapy.Field()
    two_star_ratings = scrapy.Field()
    one_star_ratings = scrapy.Field()
class Mensa_Meal(scrapy.Item):
    """One meal offered by a canteen on a given day."""

    mensa_name = scrapy.Field()
    date_of_meal_plan = scrapy.Field()
    meal_name = scrapy.Field()
    meal_attributes = scrapy.Field()
    meal_price = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import item
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
import sqlite3
# useful for handling different item types with a single interface
from items import Mensen_Table, Mensa_Meal
class IwCrawlerPipeline:
    """Store Mensen_Table and Mensa_Meal items in the imensa.db SQLite file."""

    _MENSEN_COLUMNS = ('state_name', 'city_name', 'uni_name', 'mensa_name',
                       'mensa_address', 'mensen_rating_avg', 'mensen_rating_count')
    _MEAL_COLUMNS = ('mensa_name', 'date_of_meal_plan', 'meal_name',
                     'meal_attributes', 'meal_price')

    def __init__(self):
        self.con = sqlite3.connect('imensa.db')
        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()

    def create_table_mensen(self):
        """Create mensen_table if it does not exist yet."""
        self.cur.execute("""CREATE TABLE IF NOT EXISTS mensen_table (
            state_name TEXT,
            city_name TEXT,
            uni_name TEXT,
            mensa_name TEXT,
            mensa_address LONG,
            mensen_rating_avg FLOAT,
            mensen_rating_count TEXT)
            """)
        print('TABLE MENSEN CREATED')

    def create_table_meal(self):
        """Create meal_table if it does not exist yet."""
        # The previous ``return print(...)`` left the method before this
        # statement ran, so meal_table was never created.
        self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
            mensa_name TEXT,
            date_of_meal_plan DATE,
            meal_name LONG,
            meal_attributes LONG,
            meal_price FLOAT)
            """)
        print('TABLE MEAL CREATED')

    def process_item(self, item, spider):
        """Insert the item into its table and pass it on.

        :param item: a Mensen_Table or Mensa_Meal item; any other item type is
            returned untouched
        :param spider: the spider that produced the item (unused)
        :return: the same item, so later pipelines still receive it
        """
        if isinstance(item, Mensen_Table):
            print('MENSEN TABLE ITEM PROCESSED')
            self.cur.execute(
                """INSERT INTO mensen_table (state_name, city_name, uni_name, mensa_name, mensa_address, mensen_rating_avg, mensen_rating_count) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                self._row(item, self._MENSEN_COLUMNS))
            self.con.commit()
        elif isinstance(item, Mensa_Meal):
            print('MEAL TABLE ITEM PROCESSED')
            self.cur.execute(
                """INSERT INTO meal_table (mensa_name, date_of_meal_plan, meal_name, meal_attributes, meal_price) VALUES (?, ?, ?, ?, ?)""",
                self._row(item, self._MEAL_COLUMNS))
            self.con.commit()
        return item

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.con.close()

    @staticmethod
    def _row(item, columns):
        """Return the item's values for *columns* as SQLite-bindable scalars."""
        adapter = ItemAdapter(item)
        row = []
        for column in columns:
            # A missing field becomes NULL instead of raising KeyError.
            value = adapter.get(column)
            # An ItemLoader without output processors stores lists, which
            # sqlite3 cannot bind; keep the first collected value.
            if isinstance(value, (list, tuple)):
                value = value[0] if value else None
            row.append(value)
        return row
What am I doing wrong? The items get displayed properly, but they don't reach the database file.
Any help would be highly appreciated!

The only problem I see is that you never close the connection to the database, although I am not sure that this will solve your issue. It might though and it's good practice to do anyway.
In your pipeline add close_spider method that closes the connection to the database. This method is triggered automatically by the scrapy engine just before it deletes the spider and closes the twisted reactor.
class IwCrawlerPipeline:
def __init__(self):
self.con = sqlite3.connect('imensa.db')
self.cur = self.con.cursor()
self.create_table_mensen()
self.create_table_meal()
...
def close_spider(self, spider):
self.con.close()

Scraper not grabbing all text in list/div

My scraper seems to skip on information, for example I want to extract all the countries and the highway codes belonging to each country. However, when I iterate over this I only get one country and one highway code for each list of numbered highways.
How do I get all the highway routes and their respective countries?
Here's the scraper that I am working with:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose
from itemloaders import ItemLoader
class roadItem(scrapy.Item):
    """A numbered-highway list page; each field keeps the first value loaded."""

    highway_names = Field(output_processor=TakeFirst())
    country = Field(output_processor=TakeFirst())
    route_name = Field(output_processor=TakeFirst())
class roadSpider(scrapy.Spider):
    """Walk a Wikipedia road-list category and load one item per list page."""

    name = "road"
    start_urls = ["https://en.wikipedia.org/w/index.php?title=Category:Lists_of_roads_sharing_the_same_title&pageuntil=092%0AList+of+highways+numbered+92#mw-pages"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback = self.parse
            )

    def parse(self, response):
        """Follow every list page in the category, then the category's next page."""
        # XPath attribute tests and steps are written with '@'.
        list_data = response.xpath('(//div[@class="mw-category-group"][last()])[2]//ul')
        for items in list_data:
            for link in items.xpath(".//a/@href").getall():
                # response.follow already resolves relative URLs.
                yield response.follow(link, callback = self.parse_roads)
        next_page = response.xpath("//div[@id='mw-pages']/a[1]//@href").get()
        if next_page is not None:
            yield response.follow(next_page, callback = self.parse)

    def parse_roads(self, response):
        """Load the page heading, a country and the route titles into one item."""
        country_xpath = "(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()"
        routes_xpath = "(//div[@class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//@title"

        loader = ItemLoader(roadItem())
        loader.add_value("highway_names", response.xpath("//h1[@id='firstHeading']//text()").get())
        data = response.xpath("//div[@id='mw-content-text']")
        for list_h2 in data:
            # NOTE(review): these expressions start with '//', so they search
            # the whole document rather than just ``list_h2``; ``.get()`` and
            # the TakeFirst output processors each keep a single value.
            country = list_h2.xpath(country_xpath).get()
            if not country:
                country = list_h2.xpath("(//h2//span)[1]//text()").get()
            loader.add_value("country", country)

            routes = list_h2.xpath(routes_xpath).getall()
            if not routes:
                routes = list_h2.xpath("//ul//li//@title").getall()
            for route in routes:
                loader.add_value('route_name', route)
        yield loader.load_item()

Can't yield paralel requests conducted by items pipeline

In my scrapy code I'm trying to yield the following figures from parliament's website where all the members of parliament (MPs) are listed. Opening the links for each MP, I'm making parallel requests to get the figures I'm trying to count. I didn't use metas here because my code doesn't just make consecutive requests but it makes parallel requests for the figures after the individual page of the MP is requested. Thus I thought item containers would fit my purpose better.
Here are the figures I'm trying to scrape
How many bill proposals that each MP has their signature on
How many question proposals that each MP has their signature on
How many times that each MP spoke on the parliament
In order to count and yield out how many bills has each member of parliament has their signature on, I'm trying to write a scraper on the members of parliament which works with 3 layers:
Starting with the link where all MPs are listed
From (1) accessing the individual page of each MP where the three information defined above is displayed
3a) Requesting the page with bill proposals and counting the number of them by len function
3b) Requesting the page with question proposals and counting the number of them by len function
3c) Requesting the page with speeches and counting the number of them by len function
What I want: I want to yield the inquiries of 3a,3b,3c with the name and the party of the MP
Problem: My code below doesn't yield anything but empty dictionaries for each request.
Note: Because my parse functions don't chain like parse => parse2 => parse3, but instead I have 3 parallel parse functions after parse2, I could not use meta, since I'm not yielding all the values in the third parse step. Therefore I preferred using the pipelines, which apparently doesn't work.
Main code:
'''
from scrapy import Spider
from scrapy.http import Request
from ..items import MeclisItem
import logging
class MvSpider(Spider):
    """Question version: counts bills, questions and speeches per MP.

    Kept as posted apart from formatting and the XPath '@' markers; the NOTE
    comments mark why only empty items come out.
    """

    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    def parse(self, response):
        items = MeclisItem()
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']") #taking all MPs listed
        for mv in mv_list:
            items['name'] = mv.xpath("./li/div/div/a/text()").get() # MP's name taken
            items['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip() #MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            # NOTE: ``items`` is not passed along with the request, so the
            # name and party never reach the later callbacks.
            yield Request(full_link, callback = self.mv_analysis)

    def mv_analysis(self, response):
        items = MeclisItem()
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        yield Request(billprop_link, callback = self.bill_prop_counter) #number of bill proposals to be requested
        yield Request(questionprop_link, callback = self.quest_prop_counter) #number of question propoesals to be requested
        yield Request(speech_link, callback = self.speech_counter) #number of speeches to be requested
        # NOTE: this is a fresh, still-empty item, which is what gets output.
        yield items

    # COUNTING FUNCTIONS
    # NOTE: each counter fills a new local item and never yields it.
    def bill_prop_counter(self,response):
        items = MeclisItem()
        billproposals = response.xpath("//tr[@valign='TOP']")
        items['bill_prop_count'] = len(billproposals)
        pass

    def quest_prop_counter(self, response):
        items = MeclisItem()
        questionproposals = response.xpath("//tr[@valign='TOP']")
        items['res_prop_count'] = len(questionproposals)
        pass

    def speech_counter(self, response):
        items = MeclisItem()
        speeches = response.xpath("//tr[@valign='TOP']")
        items['speech_count'] = len(speeches)
        pass
'''
items.py code:
import scrapy
class MeclisItem(scrapy.Item):
    """Per-MP record: identity plus the three activity counts."""

    name = scrapy.Field()
    party = scrapy.Field()
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()
What's displayed at scrapy:
I checked many questions on stackoverflow but still couldn't figure a way out. Thanks in advance.
PS: I spent ten minutes separately trying to colour the code above and couldn't manage that either :(

Note: Because my parse functions doesn't work like parse => parse2 => parse3 but rather I have 3 parallel parse functions after parse2, I failed to use the meta because I'm not yielding all the values at parse three.
You can do it like this:
Edit:
import scrapy
from scrapy import Spider
from scrapy.http import Request
# from ..items import MeclisItem
import logging
class MeclisItem(scrapy.Item):
    """Per-MP record: identity plus the three activity counts."""

    name = scrapy.Field()
    party = scrapy.Field()
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()
class MvSpider(Spider):
    """Count bills, written questions and speeches for every listed MP.

    One item per MP is handed from callback to callback through
    ``cb_kwargs``; the three count pages are requested one after another and
    the item is yielded once the last count is filled in.
    """

    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    def parse(self, response):
        """Start one item per MP and request that MP's page."""
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']") #taking all MPs listed
        for mv in mv_list:
            item = MeclisItem()
            item['name'] = mv.xpath("./li/div/div/a/text()").get() # MP's name taken
            # Guard against a missing party node: ``.get()`` returns None then.
            party = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get()
            item['party'] = (party or '').strip() #MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback=self.mv_analysis, cb_kwargs={'item': item})

    def mv_analysis(self, response, item):
        """Collect the three count-page links and request the first one."""
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        yield Request(billprop_link,
                      callback=self.bill_prop_counter,
                      cb_kwargs={'item': item, 'questionprop_link': questionprop_link, 'speech_link': speech_link}) #number of bill proposals to be requested

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response, item, questionprop_link, speech_link):
        """Store the bill-proposal count, then request the questions page."""
        billproposals = response.xpath("//tr[@valign='TOP']")
        item['bill_prop_count'] = len(billproposals)
        yield Request(questionprop_link,
                      callback=self.quest_prop_counter,
                      cb_kwargs={'item': item, 'speech_link': speech_link}) #number of question propoesals to be requested

    def quest_prop_counter(self, response, item, speech_link):
        """Store the question-proposal count, then request the speeches page."""
        questionproposals = response.xpath("//tr[@valign='TOP']")
        item['res_prop_count'] = len(questionproposals)
        yield Request(speech_link,
                      callback=self.speech_counter,
                      cb_kwargs={'item': item}) #number of speeches to be requested

    def speech_counter(self, response, item):
        """Store the speech count and emit the completed item."""
        speeches = response.xpath("//tr[@valign='TOP']")
        item['speech_count'] = len(speeches)
        yield item

How to parse two different items in scrapy?

I am using scrapy 2.1 for parsing a category result page.
There are 2 different things I want to scrape from that site:
Category information like e.g. Title and URL
Product items within that category page
Number 2 works, but I am struggling on how to implement the storage of category info. My first attempt is to create another Item Class CatItem:
class CatItem(scrapy.Item):
    """Category information scraped from a category result page."""

    title = scrapy.Field() # char -
    url = scrapy.Field() # char -
    level = scrapy.Field() # int -
class ProductItem(scrapy.Item):
    """A product listed on a category result page."""

    title = scrapy.Field() # char -
Let's parse the page:
def parse_item(self, response):
    """Yield the category item for this page, then one item per product on it."""
    # save category info
    category = CatItem()
    category['url'] = response.url
    # NOTE(review): the title is set to the URL as well -- presumably a
    # placeholder; confirm where the real title should come from.
    category['title'] = response.url
    category['level'] = 1
    yield category
    # now let's parse all products within that category
    # (XPath attribute tests use '@': @data-qa-id, not #data-qa-id)
    for selector in response.xpath("//article//ul/div[@data-qa-id='result-list-entry']"):
        product = ProductItem()
        product['title'] = selector.xpath(".//a/h2/text()").extract_first()
        yield product
My Pipeline:
class mysql_pipeline(object):
    """Pipeline skeleton that hands every item to ``store_db``."""

    def __init__(self):
        self.create_connection()

    def create_connection(self):
        # NOTE(review): the settings are loaded but not kept or used in this
        # excerpt -- presumably the connection setup was left out.
        settings = get_project_settings()

    def process_item(self, item, spider):
        # store_db is not part of this excerpt; it receives every item type.
        self.store_db(item, spider)
        return item
Now here I don't know how to proceed. There is only one "item" within process_item definition.
How can I pass the category information to the store_db method as well?

You can check item type in your pipeline:
from your_project.items import CatItem, ProductItem
class YourPipeline(object):
...
def process_item(self, item, spider):
if isinstance(item, CatItem):
save_category(item)
return item
UPDATE Simple PoC code:
import scrapy
import csv
from scrapy.crawler import CrawlerProcess
class BooksPipeline(object):
    """Append each item to the CSV file that belongs to its item type."""

    def process_item(self, item, spider):
        """Write the item to Categories.csv or Books.csv and pass it on.

        Items of any other type are returned unchanged; previously they
        reached ``open(None, ...)`` and raised TypeError.
        """
        if isinstance(item, CategoryItem):
            filename = 'Categories.csv'
        elif isinstance(item, BookItem):
            filename = 'Books.csv'
        else:
            return item
        with open(filename, 'a', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Title', 'URL'], lineterminator="\n")
            writer.writerow(item)
        return item
class BooksSpider(scrapy.Spider):
    """Visit every book on the start page; emit its category and the book."""

    name = "books"
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Request the detail page of every book linked from the listing."""
        for book_url in response.xpath('//ol/li//h3/a/@href').getall():
            yield scrapy.Request(
                url=response.urljoin(book_url),
                callback=self.parse_book,
            )

    def parse_book(self, response):
        """Yield a CategoryItem and a BookItem for one book page."""
        # The category is read from the second-to-last breadcrumb entry.
        category = CategoryItem()
        category['Title'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/text()').get()
        category['URL'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/@href').get()
        yield category
        book = BookItem()
        book['Title'] = response.xpath('//h1/text()').get()
        book['URL'] = response.url
        yield book
class BookItem(scrapy.Item):
    """A book: its title and page URL."""

    Title = scrapy.Field()
    URL = scrapy.Field()
class CategoryItem(scrapy.Item):
    """A book category: its title and link."""

    Title = scrapy.Field()
    URL = scrapy.Field()
# Run the spider as a script, with the pipeline above enabled.
if __name__ == "__main__":
    process = CrawlerProcess(
        {
            'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
            'DOWNLOAD_TIMEOUT':100,
            'ITEM_PIPELINES': {
                '__main__.BooksPipeline': 300,
            }
        }
    )
    process.crawl(BooksSpider)
    process.start()

ItemLoader doesn't pass the loader context to input processors

my spider: autospd.py
class AutospdSpider(scrapy.Spider):
    """Question version: load title, URL and publication time per news entry."""

    name = 'autospd'
    start_urls = ['http://news.dayoo.com/guangzhou/150960_2.shtml']
    dt_ft = "%Y-%m-%d %H:%M"

    def parse(self, response):
        list_objs = response.css("div.dy-list>div")
        for li in list_objs:
            # The format string is stored in the loader context under the key
            # "context" because that is the keyword name used here.
            loader = AutopjtItemLoader(item=AutopjtItem(), selector=li, context=self.dt_ft)
            print(loader.context.items()) #please see print-1
            # NOTE: each nested_css() call builds a sub-loader from only the
            # keyword arguments given to it, so the "context" entry above is
            # not present in the sub-loaders' context.
            loader.nested_css("h2>a").add_css("title", "::text")
            loader.nested_css("h2>a").add_css("url", "::attr(href)")
            loader.nested_css("div.txt-area>div.news-time").add_xpath("pub_time", "string()")
            yield loader.load_item()
print-1: dict_items([('context', '%Y-%m-%d %H:%M'), ('selector',
\r\n '>), ('response', None), ('item',
{}) ])
items.py
def func(value, loader_context):
    """Print the loader context it receives and return *value* unchanged.

    :param value: the single value handed over by the input processor
    :param loader_context: the loader's context mapping
    :return: ``value`` as received
    """
    print(loader_context.items()) # please see print-2
    # ft = loader_context.get("context")
    # time_dt = datetime.strptime(value, ft)
    return value
class AutopjtItemLoader(ItemLoader):
    """Loader that keeps the first value per field and runs pub_time through func."""

    default_output_processor = TakeFirst()
    pub_time_in = MapCompose(func)
class AutopjtItem(scrapy.Item):
    """A news entry: title, link and publication time."""

    title = scrapy.Field()
    url = scrapy.Field()
    pub_time = scrapy.Field()
print-2: [('selector', [2019-06-12 08:59< '>]), ('response',
None), ('item', {})]
Why don't have "context" in loader_context?

def nested_xpath(self, xpath, **context):
    """Return a sub-loader scoped to the nodes matched by *xpath*.

    The sub-loader shares this loader's item and has this loader as parent.
    Its context holds only the keyword arguments given here plus the new
    selector; nothing else is copied from this loader by this method.
    """
    selector = self.selector.xpath(xpath)
    context.update(selector=selector)
    subloader = self.__class__(
        item=self.item, parent=self, **context
    )
    return subloader
def nested_css(self, css, **context):
    """Return a sub-loader scoped to the nodes matched by the *css* selector.

    The sub-loader shares this loader's item and has this loader as parent.
    Its context holds only the keyword arguments given here plus the new
    selector; nothing else is copied from this loader by this method.
    """
    selector = self.selector.css(css)
    context.update(selector=selector)
    subloader = self.__class__(
        item=self.item, parent=self, **context
    )
    return subloader
From the scrapy's source code, if you use nested_css or nested_xpath, you must add your context. eg:
loader.nested_css("div.txt-area>div.news-time", dt_ft=self.dt_ft).add_xpath("pub_time", "string()")

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

pagination in Scrapy on javascript-driven page navigation via Selenium webdriver - scrapy

Related

Scrapy Item Pipeline does not process items to sqlite

Scraper not grabbing all text in list/div

Can't yield paralel requests conducted by items pipeline

How to parse two different items in scrapy?

ItemLoader doesn't pass the loader context to input processors

Categories

Resources