Can't yield the results of parallel requests through the items pipeline - scrapy

In my scrapy code I'm trying to yield the following figures from the parliament's website, where all the members of parliament (MPs) are listed. Opening the link for each MP, I make parallel requests to get the figures I'm trying to count. I didn't use meta here because my code doesn't just make consecutive requests; it makes parallel requests for the figures after the individual page of the MP is requested. So I thought item containers would fit my purpose better.
Here are the figures I'm trying to scrape
How many bill proposals that each MP has their signature on
How many question proposals that each MP has their signature on
How many times each MP spoke in parliament
In order to count and yield how many bills each member of parliament has their signature on, I'm trying to write a scraper for the members of parliament which works in 3 layers:
Starting with the link where all MPs are listed
From (1), accessing the individual page of each MP, where the three pieces of information defined above are displayed
3a) Requesting the page with bill proposals and counting them with the len function
3b) Requesting the page with question proposals and counting them with the len function
3c) Requesting the page with speeches and counting them with the len function
What I want: I want to yield the counts from 3a, 3b and 3c together with the name and the party of each MP
Problem: My code below yields nothing but empty dictionaries for each request
Note: Because my parse functions don't work like parse => parse2 => parse3 but rather I have 3 parallel parse functions after parse2, I couldn't use meta, since I'm not yielding all the values in a single final parse. Therefore I preferred using the pipelines, which apparently doesn't work either.
Main code:
'''
from scrapy import Spider
from scrapy.http import Request
from ..items import MeclisItem
import logging


class MvSpider(Spider):
    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    def parse(self, response):
        items = MeclisItem()
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']")  # taking all MPs listed
        for mv in mv_list:
            items['name'] = mv.xpath("./li/div/div/a/text()").get()  # MP's name taken
            items['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip()  # MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback=self.mv_analysis)
        pass

    def mv_analysis(self, response):
        items = MeclisItem()
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        yield Request(billprop_link, callback=self.bill_prop_counter)       # number of bill proposals to be requested
        yield Request(questionprop_link, callback=self.quest_prop_counter)  # number of question proposals to be requested
        yield Request(speech_link, callback=self.speech_counter)            # number of speeches to be requested
        yield items

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response):
        items = MeclisItem()
        billproposals = response.xpath("//tr[@valign='TOP']")
        items['bill_prop_count'] = len(billproposals)
        pass

    def quest_prop_counter(self, response):
        items = MeclisItem()
        questionproposals = response.xpath("//tr[@valign='TOP']")
        items['res_prop_count'] = len(questionproposals)
        pass

    def speech_counter(self, response):
        items = MeclisItem()
        speeches = response.xpath("//tr[@valign='TOP']")
        items['speech_count'] = len(speeches)
        pass
'''
items.py code:
import scrapy


class MeclisItem(scrapy.Item):
    name = scrapy.Field()
    party = scrapy.Field()
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()
    pass
What scrapy displays: an empty dict is yielded for each request.
I checked many questions on Stack Overflow but still couldn't figure a way out. Thanks in advance.
PS: I spent ten minutes separately trying to colour the code above and couldn't manage that either :(

Note: Because my parse functions don't work like parse => parse2 => parse3 but rather I have 3 parallel parse functions after parse2, I couldn't use meta, since I'm not yielding all the values in a single final parse.
You can do it like this:
Edit:
import scrapy
from scrapy import Spider
from scrapy.http import Request
# from ..items import MeclisItem
import logging


class MeclisItem(scrapy.Item):
    name = scrapy.Field()
    party = scrapy.Field()
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()


class MvSpider(Spider):
    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    def parse(self, response):
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']")  # taking all MPs listed
        for mv in mv_list:
            item = MeclisItem()
            item['name'] = mv.xpath("./li/div/div/a/text()").get()  # MP's name taken
            item['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip()  # MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback=self.mv_analysis, cb_kwargs={'item': item})

    def mv_analysis(self, response, item):
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        yield Request(billprop_link,
                      callback=self.bill_prop_counter,
                      cb_kwargs={'item': item, 'questionprop_link': questionprop_link, 'speech_link': speech_link})  # bill proposals page requested first

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response, item, questionprop_link, speech_link):
        billproposals = response.xpath("//tr[@valign='TOP']")
        item['bill_prop_count'] = len(billproposals)
        yield Request(questionprop_link,
                      callback=self.quest_prop_counter,
                      cb_kwargs={'item': item, 'speech_link': speech_link})  # question proposals page requested next

    def quest_prop_counter(self, response, item, speech_link):
        questionproposals = response.xpath("//tr[@valign='TOP']")
        item['res_prop_count'] = len(questionproposals)
        yield Request(speech_link,
                      callback=self.speech_counter,
                      cb_kwargs={'item': item})  # speeches page requested last

    def speech_counter(self, response, item):
        speeches = response.xpath("//tr[@valign='TOP']")
        item['speech_count'] = len(speeches)
        yield item
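A note on the design choice: chaining the three requests means the item is only yielded from the last callback, once all three counts have been filled in, so nothing reaches the output half-empty. If you would rather fetch the three count pages in parallel, as the question originally intended, one possible sketch is below; it reuses MeclisItem and the same XPaths, and the completion check at the end is an illustration added here, not part of the answer above (it also assumes all three requests succeed).

    # Sketch: these would replace mv_analysis and the three counter callbacks above.
    COUNT_FIELDS = ('bill_prop_count', 'res_prop_count', 'speech_count')

    def mv_analysis(self, response, item):
        links = {
            'bill_prop_count': response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get(),
            'res_prop_count': response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get(),
            'speech_count': response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get(),
        }
        for field, href in links.items():
            # All three requests are scheduled at once and share the same item instance.
            yield Request(response.urljoin(href), callback=self.count_rows,
                          cb_kwargs={'item': item, 'field': field})

    def count_rows(self, response, item, field):
        item[field] = len(response.xpath("//tr[@valign='TOP']"))
        # Yield the item only once every count field has been set by one of the parallel callbacks.
        if all(f in item for f in self.COUNT_FIELDS):
            yield item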

Related

How to store scraped links in Scrapy

I did a lot of searching on the web but couldn't find anything related; maybe it has to do with the wording I used.
Basically, I would like to write a spider that is able to save the scraped links and to check whether some other links have already been scraped. Is there any built-in function in scrapy to do so?
Many thanks
You can write your own method for this purpose. I wrote one in my project and you can use it as a reference: a dictionary called already_parsed_urls is kept on the spider, and every callback updates it.
You can look at the code snippet below and take it as a reference.
from scrapy.spiders import CrawlSpider
from scrapy_splash import SplashRequest


class Spider(CrawlSpider):
    name = 'test'
    allowed_domains = []
    web_url = ''
    start_urls = ['']
    counter = 0
    already_parsed_urls = {}  # URLs that have been handled already
    wait_time = 3
    timeout = '90'

    def start_requests(self):
        for start_url in self.start_urls:
            yield SplashRequest(start_url, callback=self.parse_courses,
                                args={'wait': self.wait_time, 'timeout': self.timeout})

    def parse_courses(self, response):
        course_urls = []
        # ... extract the course links from the response into course_urls ...
        yield SplashRequest(course_urls[0], callback=self.parse_items, args={'wait': self.wait_time})

    def parse_items(self, response):
        if not self.already_parsed_urls.get(response.url):
            # Get Program URL
            program_url = response.url
            # ... scrape the page here ...
            self.already_parsed_urls[response.url] = 1
        else:
            return {}
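For what it's worth, Scrapy also has a built-in mechanism for this: the default request dupefilter (scrapy.dupefilters.RFPDupeFilter) drops any request whose fingerprint has already been seen during the run, unless the request is created with dont_filter=True. If you only need to remember which pages you scraped yourself, a plain set on the spider is usually enough; a minimal sketch with placeholder URLs and selectors:

import scrapy


class SeenUrlsSpider(scrapy.Spider):
    name = 'seen_urls'
    start_urls = ['https://example.com/']  # placeholder

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seen_urls = set()  # URLs already parsed in this run

    def parse(self, response):
        if response.url in self.seen_urls:
            return  # already handled
        self.seen_urls.add(response.url)
        # ... scrape the page here ...
        for href in response.css('a::attr(href)').getall():
            # Scrapy's own dupefilter will additionally skip exact repeats of these requests.
            yield response.follow(href, callback=self.parse)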

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I manage to extract information from static websites but am now trying my hand at following URLs and extracting data (which of course involves some JavaScript). I have installed scrapy-splash for this, and it is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc, and the button at the top right takes you to the next page (which is JavaScript, hence Splash). I want to scrape some basic data (company name, sectors, etc.) on all the pages up to the last one. This is what I have done so far, and I need help correcting it so it executes successfully.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse


class TAFolio(scrapy.Spider):
    name = 'Portfolio'
    start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()

        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team
        }

        next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
        if next_page is not None:
            yield SplashRequest(urlparse.urljoin('https://www.ta.com', next_page), callback=self.parse, args={"wait": 3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
Update: the issue was in the order in which I was scraping the pages. Below is the updated code, which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse


class TAFolio(scrapy.Spider):
    name = 'Portfolio'
    start_urls = [
        'https://www.ta.com/portfolio/business-services',
        'https://www.ta.com/portfolio/consumer',
        'https://www.ta.com/portfolio/financial-services',
        'https://www.ta.com/portfolio/healthcare',
        'https://www.ta.com/portfolio/technology'
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
        for i in companylink:
            yield response.follow('https://www.ta.com' + str(i), callback=self.parse1)

    def parse1(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()
        about_company = response.css('h2.item_detail-main-content-heading::text').extract()
        about_company_detail = response.css('div.markdown p::text').extract()

        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team,
            'about_company': about_company,
            'about_company_detail': about_company_detail
        }
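As an aside on the first version: extract() returns a list, so the check "if next_page is not None" is always true (an empty list is not None) and urljoin ends up being handed a list instead of a string. If the page-by-page approach were kept, the pagination would look more like this sketch, using the same selector:

# At the end of the first version's parse():
next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract_first()
if next_page:
    yield SplashRequest(urlparse.urljoin('https://www.ta.com', next_page),
                        callback=self.parse, args={"wait": 3})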

Spider is working perfectly but doesn't scrape some results

It's working fine; there is information for approximately 208 products, but for some products it gives no details. I've executed those product links separately in the scrapy shell and they work fine there, so why does the spider miss out on about 25% of the product details?
I've tried rotating user agents and applying different XPaths, but in vain.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from ..items import AmazonItem
import time
from scrapy.linkextractors import LinkExtractor
import urllib.parse


class QuotesSpider(scrapy.Spider):
    name = 'pet'
    start_urls = ['https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&qid=1567115653&rnid=1632651031&ref=sr_nr_p_89_1',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=2',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=3',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=4',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=5'
                  ]

    def parse(self, response):
        links = response.xpath("//h2/a[contains(@href,'/dp')]/@href").extract()
        urll = ['https://www.amazon.co.uk' + link for link in links]
        urls = urll
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_details)

    def parse_details(self, response):
        global name1
        global sales_rank11
        global price1
        global prime1
        list = AmazonItem()
        name = response.xpath(".//*[(@id='productTitle')]/text()").extract_first()
        if name is None:
            name1 = name
            self.logger.info('skip')
        else:
            name1 = name.replace('\n', '').strip()
        price = response.xpath("//span[@id='price_inside_buybox']/text()").get()
        if price is None:
            price1 = response.xpath("//span[@class='a-color-price']/text()").get()
            if price1 is None:
                price1 = 'No Price Available'
                self.logger.info('skip')
        else:
            price1 = price.replace('\n', '').replace(' ', '')
        prime = response.xpath("//span[@id='price-shipping-message']/b").get()
        if prime is None:
            prime1 = 'Not Prime'
        else:
            prime1 = 'Prime'
        sales_rank1 = response.xpath("//tr[@id='SalesRank']/td[@class='value']/text()").get()
        if sales_rank1 is None:
            sales_rank11 = 'No Sales Rank Available'
        else:
            sales_rank11 = sales_rank1.replace('(', '').replace('\n', '')
        list['Name'] = name1
        list['Price'] = price1
        list['SalesRank'] = sales_rank11
        list['Prime'] = prime1
        list['Url'] = response.url
        yield list
Box 2 contains the correct information, but box 1 has no data, even though if we go to the link the data is there.
That's the product name from box 1's URL, working fine in the scrapy shell, but not in the spider.
Is there something I am missing?
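Since the same URLs work in the scrapy shell, one thing that may help is to log the responses that come back without the expected markup; Amazon sometimes returns a robot-check page instead of the product page, which would explain details going missing only some of the time. A rough sketch, reusing the productTitle XPath from the question (the retry with dont_filter is an illustration, not something from the post):

def parse_details(self, response):
    # If the expected product markup is missing, log the URL so the affected pages can be inspected.
    if not response.xpath("//*[@id='productTitle']"):
        self.logger.warning("No productTitle at %s (status %s, %d bytes)",
                            response.url, response.status, len(response.body))
        # Optionally retry the page once; dont_filter bypasses the duplicate filter.
        if not response.meta.get('retried'):
            yield response.request.replace(dont_filter=True, meta={'retried': True})
        return
    # ... continue with the normal extraction from the spider above ...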

How to add multiple variables in one string (URL)

My spider starts off with the start_urls, being:
https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL
Based on a keywords.csv file located in my resources folder, the keywordsID (the number 20035386) will change. Once the number changes, the spider will fetch the data of another product.
I also have a chunk of code which checks each page for isTruncated = true; if that's the case, it increments the page number in the URL by 1. The only problem I am having right now is that I don't know how to put a second variable into one string (the URL). When isTruncated = true, the code needs to adjust both the URL's page number AND its keywordsID accordingly. Currently, I have only managed to add a variable for the page number.
Currently the chunk of code is:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
However, it should become something like:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/ {keywordsid} ?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
When I run the spider, it will crawl all the pages of the product with keywordsID 20035386, but it will only crawl the first page of all the other products listed in the keywords.csv file.
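(For reference, str.format can fill several placeholders in one call, so the snippet above also needs the ID handed to format. A minimal sketch, assuming keywordsid holds the ID read from keywords.csv; the answer below shows how to carry that ID along in meta so it is available when the next page is requested.)

url = ("https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/"
       "{keywordsid}?page={page}&size=8&isocode=nl-NL").format(keywordsid=keywordsid, page=next_page)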
FULL CODE
./krc/spiders/krc_spider.py
# -*- coding: utf-8 -*-
import scrapy
from krc.items import KrcItem
import json
import os
import csv
import time
import datetime


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def start_requests(self):
        """Read keywords from the keywords file and construct the search URL"""
        with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
            for keyword in csv.DictReader(search_keywords):
                search_text = keyword["keyword"]
                category = keyword["keywordtype"]
                url = "https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)
                # The meta is used to send our search text into the parser as metadata
                yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text, "category": category})

    def parse(self, response):
        category = response.meta["category"]
        current_page = response.meta.get("page", 1)
        next_page = current_page + 1

        # Timestamp of the fetch, using the default timezone of the requesting machine
        ts = time.time()
        timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')

        # Defining the items
        item = KrcItem()
        data = json.loads(response.text)
        for company in data.get('products', []):
            item["productid"] = company["id"]
            item["category"] = category
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"].replace("\u20ac", "").strip()
            item["timestamp"] = timestamp
            yield item

        # If "isTruncated" is true (boolean), the next page will be requested
        if data["isTruncated"]:
            yield scrapy.Request(
                url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
                callback=self.parse,
                meta={'page': next_page, "category": category},
            )
./krc/resources/keywords.csv
keyword,keywordtype
20035386,Hogedrukreiniger
20035424,Window Vacs
Current Output
When I run the spider it fetches the data from all the pages of the product with keywordsID 20035386. For all the other products with a different keywordsID, only the data from the first page is fetched.
Use response.meta for this:
def start_requests(self):
    """Read keywords from the keywords file and construct the search URL"""
    with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
        for keyword in csv.DictReader(search_keywords):
            product_id = keyword["keyword"]
            category = keyword["keywordtype"]
            url = "https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(product_id)
            # The meta is used to send our product id and category into the parser as metadata
            yield scrapy.Request(url, callback=self.parse, meta={"category": category, "product_id": product_id})

def parse(self, response):
    category = response.meta["category"]
    product_id = response.meta["product_id"]
    current_page = response.meta.get("page", 1)
    next_page = current_page + 1

    # Timestamp of the fetch, using the default timezone of the requesting machine
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')

    # Defining the items
    item = KrcItem()
    data = json.loads(response.text)
    for company in data.get('products', []):
        item["productid"] = company["id"]
        item["category"] = category
        item["name"] = company["name"]
        item["description"] = company["description"]
        item["price"] = company["priceFormatted"].replace("\u20ac", "").strip()
        item["timestamp"] = timestamp
        yield item

    # If "isTruncated" is true (boolean), the next page will be requested
    if data["isTruncated"]:
        yield scrapy.Request(
            url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{product_id}?page={page}&size=8&isocode=nl-NL".format(page=next_page, product_id=product_id),
            callback=self.parse,
            meta={'page': next_page, "category": category, "product_id": product_id},
        )
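(The key change from the question's code is that product_id now travels in meta with every request, so the pagination request at the end can format the URL with both the next page number and the product's own keywordsID instead of the hard-coded 20035386.)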
I believe you need a nested for loop for when your search_text changes.
for [first iterating variable] in [outer loop]:  # Outer loop
    [do something]  # Optional
    for [second iterating variable] in [nested loop]:  # Nested loop
        [do something]
Check this out, it might help you:
For Loops
I think adding the keyword to the URL would look like the following. It may or may not need + signs before and after search_text; my knowledge is limited.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/"search_text"?page={page}&size=8&isocode=nl-NL".format(page=next_page),
though I'm not really following what this line is doing, at least the format(search_text) portion of it.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URLs from a database and also need the ID associated with each URL so that I can pass it to the items pipeline and store it in the table along with the items.
I am using make_requests_from_url(row[1]) to feed the start URLs into start_urls = [], which forms the list of starting URLs. The IDs in row[0] are what I need to pass to the items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem


class AmzProductinfoSpiderSpider(scrapy.Spider):
    name = 'amz_ProductInfo_Spider'
    nextPageNumber = 2
    allowed_domains = ['amazon.in']
    start_urls = []
    url_fid = []

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()

    def start_requests(self):
        sql_get_StartUrl = 'SELECT * FROM database.table'
        self.cursor.execute(sql_get_StartUrl)
        rows = self.cursor.fetchall()
        for row in rows:
            yield self.make_requests_from_url(row[1])
I have tried comparing response.url in the parse method, but that changes as the spider moves on to the next page.
Not sure how I can achieve this; any direction is appreciated.
It's not clear why you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
    sql_get_StartUrl = 'SELECT * FROM database.table'
    self.cursor.execute(sql_get_StartUrl)
    rows = self.cursor.fetchall()
    for row in rows:
        yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)

def parse(self, response):
    url_id = response.meta["url_id"]