Scrapy: How to add parameters to each request with start_requests?

I am using Scrapy 2.1 to scrape content from category pages which are paginated. The default result set size is 20 and I want to increase it to 1000 in order to request fewer pages.
e.g.
/category1
/category1?VIEW_SIZE=20&VIEW_INDEX=1
wanted:
/category1?VIEW_SIZE=1000&VIEW_INDEX=1
Rules:
rules = (
    # parse all index pages
    Rule(
        LinkExtractor(
            allow=['^.*\/[a-z-0-9]+(\?VIEW_SIZE=20&VIEW_INDEX=\d{0,3}&filterBy=default&sortBy=default)?(?<![\d])$'],
            # deny=[''],
            restrict_xpaths=(['//aside[@id="aside-st"]',  # navigation bar
                              '//ul[@class="page-lst"]/li[position()<last()]']),  # include pagination
        ),
        follow=True,
        callback='parse_item'
    ),
)
Overriding start_requests to add parameters to each URL requested:
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url, meta={'VIEW_SIZE': 1000})
This does not work. How can I add the given parameters to each URL I request?
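Passing meta only attaches data to the request object; it never changes the URL that is fetched. For the start URLs you can build the parameters into the URL itself, e.g. with w3lib (a Scrapy dependency). The following is only a sketch with placeholder names and URLs; links extracted later by the rules still need the process_links approach shown in the answer below:
import scrapy
from w3lib.url import add_or_replace_parameter  # w3lib ships with Scrapy


class CategorySpider(scrapy.Spider):  # hypothetical, standalone example
    name = 'category'
    start_urls = ['https://www.example.com/category1']  # placeholder URL

    def start_requests(self):
        for url in self.start_urls:
            # rewrite the URL itself; meta={'VIEW_SIZE': 1000} would not change it
            url = add_or_replace_parameter(url, 'VIEW_SIZE', '1000')
            url = add_or_replace_parameter(url, 'VIEW_INDEX', '1')
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        self.logger.info('fetched %s', response.url)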

This solved my problem of adding custom parameters to each request:
Add a process_links hook to the Rule:
rules = (
    Rule(
        LinkExtractor(
            allow=['^.*\/c\/.*'],
            # deny=[''],
            restrict_xpaths=(['//li[contains(@class, "navigation--entry")]',
                              ]),
        ),
        process_links='link_filtering',
        callback='parse_item',
        follow=True
    ),
)
Add the parameters to each URL:
# get the max amount of results per category and add n=x results to the URL
def link_filtering(self, links):
    for link in links:
        link.url = "%s?p=1&n=1000" % link.url
    return links
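A slightly more defensive variant of link_filtering (my sketch, not part of the original answer): appending "?p=1&n=1000" breaks links that already carry a query string, so w3lib's add_or_replace_parameter can be used instead:
from w3lib.url import add_or_replace_parameter  # w3lib ships with Scrapy


# drop-in replacement for the method above (placed inside the spider class)
def link_filtering(self, links):
    # set the paging parameters without clobbering an existing query string
    for link in links:
        link.url = add_or_replace_parameter(link.url, 'p', '1')
        link.url = add_or_replace_parameter(link.url, 'n', '1000')
    return links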

Related

Scrapy/Selenium: How do I follow the links in 1 webpage?

I am new to web-scraping.
I want to go to Webpage_A and follow all the links there.
Each of the links leads to a page where I can select some buttons and download the data as an Excel file.
I tried the code below, but I believe there is an error with
if link:
yield SeleniumRequest(
Instead of using "SeleniumRequest" to follow the links, what should I use?
If using pure Scrapy, I know I can use
yield response.follow(
Thank you
class testSpider(scrapy.Spider):
    name = 'test_s'

    def start_requests(self):
        yield SeleniumRequest(
            url='CONFIDENTIAL',
            wait_time=15,
            screenshot=True,
            callback=self.parse
        )

    def parse(self, response):
        tables_name = response.xpath("//div[@class='contain wrap:l']//li")
        for t in tables_name:
            name = t.xpath(".//a/span/text()").get()
            link = t.xpath(".//a/@href").get()
            if link:
                yield SeleniumRequest(
                    meta={'table_name': name},
                    url=link,
                    wait_time=15,
                    screenshot=True,
                    callback=self.parse_table
                )

    def parse_table(self, response):
        name = response.request.meta['table_name']
        button_select = response.find_element_by_xpath("(//a[text()='Select All'])").click()
        button_st_yr = response.find_element_by_xpath("//select[@name='ctl00$ContentPlaceHolder1$StartYearDropDownList']/option[1]").click()
        button_end_mth = response.find_element_by_xpath("//select[@name='ctl00$ContentPlaceHolder1$EndMonthDropDownList']/option[text()='Dec']").click()
        button_download = response.find_element_by_xpath("//input[@id='ctl00_ContentPlaceHolder1_DownloadButton']").click()
        yield {
            'table_name': name
        }
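SeleniumRequest itself is a reasonable way to follow the links; what usually needs changing in code like the above is (a) joining relative hrefs with response.urljoin and (b) doing the clicking on the Selenium webdriver, which scrapy-selenium exposes on the request meta, rather than on the Scrapy response. A sketch under those assumptions (spider name, URL and XPaths are placeholders taken from the question):
import scrapy
from scrapy_selenium import SeleniumRequest  # requires the scrapy-selenium middleware to be enabled


class TableSpider(scrapy.Spider):  # hypothetical name
    name = 'tables'

    def start_requests(self):
        yield SeleniumRequest(url='https://www.example.com', wait_time=15, callback=self.parse)

    def parse(self, response):
        for t in response.xpath("//div[@class='contain wrap:l']//li"):
            name = t.xpath(".//a/span/text()").get()
            link = t.xpath(".//a/@href").get()
            if link:
                yield SeleniumRequest(
                    url=response.urljoin(link),   # make relative hrefs absolute
                    meta={'table_name': name},
                    wait_time=15,
                    callback=self.parse_table,
                )

    def parse_table(self, response):
        # the webdriver is available on the request meta; click on it, not on the response
        driver = response.request.meta['driver']
        driver.find_element_by_xpath("//a[text()='Select All']").click()
        yield {'table_name': response.request.meta['table_name']}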

Scrapy Amazon Pagination First Few Pages

Currently for pagination in Amazon data scraper using Scrapy, I am using
next_page = response.xpath('//li[#class="a-last"]/a/#href').get()
if next_page:
next_page = 'https://www.amazon.com' + next_page
yield scrapy.Request(url=next_page,callback=self.parse,headers=self.amazon_header,dont_filter=True)
Say I want to fetch data from only the first 3 pages. How do I do it?
Go to the settings.py file and limit the page count as follows:
CLOSESPIDER_PAGECOUNT = 3
Alternative: suppose the URL pattern is something like
url = ['https://www.quote.toscrape/page=1']
Then generate the pages directly in start_urls and drop the next-page logic (range(1, 4) covers the first 3 pages):
start_urls = ['https://www.quote.toscrape/page=' + str(x) for x in range(1, 4)]
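A third option (a sketch, not from the original answers; spider name and start URL are placeholders): keep the next-page logic but carry a page counter on the request and stop once the limit is reached:
import scrapy


class AmazonPagesSpider(scrapy.Spider):  # hypothetical spider; header handling omitted
    name = 'amazon_pages'
    start_urls = ['https://www.amazon.com/s?k=example']  # placeholder search URL
    max_pages = 3

    def parse(self, response):
        page = response.meta.get('page', 1)
        # ... extract the items for this page here ...
        next_page = response.xpath('//li[@class="a-last"]/a/@href').get()
        if next_page and page < self.max_pages:
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse,
                meta={'page': page + 1},
                dont_filter=True,
            )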

Crawling single pages with scrapy.Spider works but not for entire website with CrawlSpider

Need some help here. My code works when I crawl one page via scrapy.Spider. However, once I switch to CrawlSpider to scrape the entire website, it does not seem to work at all.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class QuotesSpider(CrawlSpider):
    name = "quotes"
    allowed_domains = ['reifen.check24.de']
    start_urls = [
        'https://reifen.check24.de/pkw-sommerreifen/toyo-proxes-cf2-205-55r16-91h-2276003?label=ppc',
        'https://reifen.check24.de/pkw-sommerreifen/michelin-pilot-sport-4-205-55zr16-91w-213777?label=pc'
    ]

    rules = (
        Rule(LinkExtractor(deny=('cart')), callback='parse_item', follow=True),
    )

    def parse(self, response):
        for quote in response.xpath('/html/body/div[2]/div/section/div/div/div[1]'):
            yield {
                'brand': quote.xpath('//tbody//tr[1]//td[2]//text()').get(),
                'pattern': quote.xpath('//tbody//tr[3]//td[2]//text()').get(),
                'size': quote.xpath('//tbody//tr[6]//td[2]//text()').get(),
                'RR': quote.xpath('div[1]/div[1]/div/div[1]/div[2]/span/span/span/div/div/div[1]/span/text()').get(),
                'WL': quote.xpath('div[1]/div[1]/div/div[1]/div[2]/span/span/span/div/div/div[2]/span/text()').get(),
                'noise': quote.xpath('div[1]/div[1]/div/div[1]/div[2]/span/span/span/div/div/div[3]/span/text()').get(),
            }
Am I missing something?
You have a tiny mistake: your rule's callback is 'parse_item', but the method in the spider is named parse. CrawlSpider uses parse internally to drive its rules, so it must not be overridden. Keep the rule as it is:
rules = (
    Rule(LinkExtractor(deny=('cart')), callback='parse_item', follow=True),
)
and rename your method from parse to parse_item:
def parse_item(self, response):
    ...
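For reference, a minimal corrected version of the spider (a sketch; the XPaths are copied from the question and not verified):
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class QuotesSpider(CrawlSpider):
    name = "quotes"
    allowed_domains = ['reifen.check24.de']
    start_urls = ['https://reifen.check24.de/pkw-sommerreifen/toyo-proxes-cf2-205-55r16-91h-2276003?label=ppc']

    rules = (
        # 'parse' is reserved by CrawlSpider, so the callback gets its own name
        Rule(LinkExtractor(deny=('cart',)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for quote in response.xpath('/html/body/div[2]/div/section/div/div/div[1]'):
            yield {
                'brand': quote.xpath('//tbody//tr[1]//td[2]//text()').get(),
                'pattern': quote.xpath('//tbody//tr[3]//td[2]//text()').get(),
                'size': quote.xpath('//tbody//tr[6]//td[2]//text()').get(),
            }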

How to restrict the area in which LinkExtractor is being applied?

I have a scraper with the following rules:
rules = (
    Rule(LinkExtractor(allow=('\S+list=\S+'))),
    Rule(LinkExtractor(allow=('\S+list=\S+'))),
    Rule(LinkExtractor(allow=('\S+view=1\S+')), callback='parse_archive'),
)
As you can see, the first two rules are exactly the same.
What I would like to do is tell Scrapy to extract only the links I am interested in by restricting it to particular places within a page. For convenience, here are the corresponding XPaths, although I would prefer a solution based on BeautifulSoup's syntax.
//*[@id="main_frame"]/tbody/tr[3]/td[2]/table/tbody/tr/td/div/table/tbody/tr/td[1]
//*[@id="main_frame"]/tbody/tr[3]/td[2]/table/tbody/tr/td/div/form/table/tbody/tr[1]
//*[@id="main_frame"]/tbody/tr[3]/td[2]/table/tbody/tr/td/div/form/table/tbody/tr[2]
EDIT:
Let me give you an example. Let's assume that I want to extract five of the six links at the top of Scrapy's official page:
And here is my spider. Any ideas?
class dmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["scrapy.org"]
    start_urls = [
        "http://scrapy.org/",
    ]

    rules = (
        Rule(LinkExtractor(allow=('\S+/'), restrict_xpaths=('/html/body/div[1]/div/ul')), callback='first_level'),
    )

    def first_level(self, response):
        taco = dmozItem()
        taco['basic_url'] = response.url
        return taco
This can be done with the restrict_xpaths parameter. See the LxmlLinkExtractor documentation
Edit:
You can also pass a list to restrict_xpaths.
Edit 2:
Full example that should work:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class dmozItem(scrapy.Item):
    basic_url = scrapy.Field()


class dmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["scrapy.org"]
    start_urls = [
        "http://scrapy.org/",
    ]

    def clean_url(value):
        return value.replace('/../', '/')

    rules = (
        Rule(
            LinkExtractor(
                allow=('\S+/'),
                restrict_xpaths=(['.//ul[@class="navigation"]/a[1]',
                                  './/ul[@class="navigation"]/a[2]',
                                  './/ul[@class="navigation"]/a[3]',
                                  './/ul[@class="navigation"]/a[4]',
                                  './/ul[@class="navigation"]/a[5]']),
                process_value=clean_url
            ),
            callback='first_level'),
    )

    def first_level(self, response):
        taco = dmozItem()
        taco['basic_url'] = response.url
        return taco
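Applied to the question, the same idea is to pass the question's XPaths (or container elements above them) as the restrict_xpaths list of whichever rule should be limited; which XPath belongs to which rule is a guess here, so treat this as a sketch. LinkExtractor also accepts restrict_css if a CSS/BeautifulSoup-style selector feels more natural:
rules = (
    Rule(LinkExtractor(
        allow=('\S+list=\S+'),
        restrict_xpaths=[
            '//*[@id="main_frame"]/tbody/tr[3]/td[2]/table/tbody/tr/td/div/table/tbody/tr/td[1]',
            '//*[@id="main_frame"]/tbody/tr[3]/td[2]/table/tbody/tr/td/div/form/table/tbody/tr[1]',
        ])),
    Rule(LinkExtractor(
        allow=('\S+view=1\S+'),
        restrict_xpaths=[
            '//*[@id="main_frame"]/tbody/tr[3]/td[2]/table/tbody/tr/td/div/form/table/tbody/tr[2]',
        ]),
        callback='parse_archive'),
)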

Does a scrapy spider download from multiple domains concurrently?

I am trying to scrape 2 domains concurrently. I have created a spider like this:
class TestSpider(CrawlSpider):
    name = 'test-spider'
    allowed_domains = ['domain-a.com', 'domain-b.com']
    start_urls = ['http://www.domain-a.com/index.html',
                  'http://www.domain-b.com/index.html']

    rules = (
        Rule(LinkExtractor(), follow=True, callback='parse_item'),
    )

    def parse_item(self, response):
        log.msg('parsing ' + response.url, log.DEBUG)
I would expect to see a mix of domain-a.com and domain-b.com entries in the output, but I only see domain-a mentioned in the logs. However, if I run separate spiders/crawlers I do see both domains scraped concurrently (not actual code, but it illustrates the point):
def setup_crawler(url):
    spider = TestSpider(start_url=url)
    crawler = Crawler(get_project_settings())
    crawler.configure()
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl(spider)
    crawler.start()

setup_crawler('http://www.domain-a.com/index.html')
setup_crawler('http://www.domain-b.com/index.html')
log.start(loglevel=log.DEBUG)
reactor.run()
Thanks
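The multi-crawler snippet above uses the old Crawler/reactor API; on current Scrapy versions the same experiment is usually written with CrawlerProcess (a sketch, assuming the TestSpider class above). Note also that how aggressively each domain is fetched within a single spider is governed by the CONCURRENT_REQUESTS and CONCURRENT_REQUESTS_PER_DOMAIN settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# two crawls of the same spider class, one start URL each
process.crawl(TestSpider, start_urls=['http://www.domain-a.com/index.html'])
process.crawl(TestSpider, start_urls=['http://www.domain-b.com/index.html'])
process.start()  # blocks until both crawls finish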