How to limit levels of link that Scrapy crawler? - scrapy

For this example code:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
def parse(self, response):
for quote in response.css('div.quote'):
yield {
'text': quote.css('span.text::text').get(),
'author': quote.css('span small::text').get(),
'tags': quote.css('div.tags a.tag::text').getall(),
}
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
If I the crowler to visit at most 3 levels of links, how to restrict that? Not the total number of links visited, but the level of links relative to the initial link.

You can use DEPTH_LIMIT setting in your spider in order to limit depth
of crawl:
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
custom_settins = {
`DEPTH_LIMIT`:3
}
...

Related

scrapy pagination not working on tripadvisor

i'm trying to scrape the restaurant pages on tripadvisor (just to learn how it works)
However, i only get the first page.
What am I missing?
here is the code, thanks!
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name' : stores.css('a.bHGqj::text').extract(),
'link' : stores.css('a.bHGqj').xpath("#href").extract()
}
next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
##next_page = response.xpath('//a[contains(text(), "Next")]/#href).extract())
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
#djmystica, Now it's working fine
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = [
'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name': stores.css('a.bHGqj::text').extract_first(),
'link': stores.css('a.bHGqj').xpath("#href").extract_first()}
#next_page = ('http://tripadvisor.com' +response.css('a.nav').attrib['href']).extract()
next_page = response.xpath('//a[contains(text(), "Next")]/#href').extract_first()
abs_next_page = f'https://www.tripadvisor.com{next_page}'
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if abs_next_page is not None:
yield response.follow(abs_next_page, callback=self.parse)

Count scraped items from scrapy

Looking to just count the number of things scraped. New to python and scraping just following the example and what to know how to just count the number of times Albert Einstein shows up and print to a json file. Just can not get it to print to file using print, yield, or return.
import scrapy
class QuotesSpider(scrapy.Spider):
name = "author"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
def parse(self, response):
i=0
for quote in response.css('div.quote'):
author = quote.css("small.author::text").get()
if author == "Albert Einstein":
i+=1
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
I found out how to get to the item_scraped_count that shows up in the log output at the end of the spider.
import scrapy
from scrapy import signals
class CountSpider(scrapy.Spider):
name = 'count'
start_urls = ['https://example.com']
#classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(CountSpider, cls).from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_closed(self, spider):
stats = spider.crawler.stats.get_stats()
numcount = str(stats['item_scraped_count'])
Here I can create a csv file with the stats
In scrapy request are made asynchronously, and each request will callback to the parse function indepedently. Your i variable is not an instance variable, so it's scope is limited to each function call.
Even if that wasn't the case, the recursion would turn your counter to 0 in each callback.
I would suggest you to take a look at scrapy items, at the end of the scrapy process it will return a counter with the nr of scraped items. Although that maybe an overkill if you don't want to store anymore information but the number of occurrences of "Albert Einstein".
If that's all you want, you can use a dirtier solution, set your counter var to be a instance var and have parse method to increment it, like this:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "author"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
counter = 0
def parse(self, response):
for quote in response.css('div.quote'):
author = quote.css("small.author::text").get()
if author == "Albert Einstein":
self.counter += 1
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I manage to extract information from static websites but am now trying my hand following urls and extracting data (which ofcourse involves some javascript). I have installed scrapy-splash for the same which is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button to the top right side takes you to the next page (which is javascript, hence splash). I want to scrape some basic data (like company name, sectors etc) on all the pages till the last one. This is what I have done so far and I need help to correct this to successfully execute.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
name = 'Portfolio'
start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
def parse(self, response):
companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
status = response.css('.item_detail-main-info-group-item::text')[2].extract()
location = response.css('.item_detail-main-info-group-item::text')[3].extract()
region = response.css('.item_detail-main-info-group-item::text')[4].extract()
team = response.css('div.item_detail-main-info-group a::text').extract()
yield {
'companyname': companyname,
'sectors': sectors,
'investmentyear': investmentyear,
'status': status,
'location': location,
'region': region,
'team': team
}
next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
if next_page is not None:
yield SplashRequest(urlparse.urljoin('https://www.ta.com',next_page),callback=self.parse, args={"wait":3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
Update. The issue was in the order in which I had the scraping of websites. Below is the updated code which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
name = 'Portfolio'
start_urls = [
'https://www.ta.com/portfolio/business-services',
'https://www.ta.com/portfolio/consumer',
'https://www.ta.com/portfolio/financial-services',
'https://www.ta.com/portfolio/healthcare',
'https://www.ta.com/portfolio/technology'
]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
def parse(self, response):
companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
for i in companylink:
yield response.follow('https://www.ta.com' + str(i), callback=self.parse1)
def parse1(self, response):
companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
status = response.css('.item_detail-main-info-group-item::text')[2].extract()
location = response.css('.item_detail-main-info-group-item::text')[3].extract()
region = response.css('.item_detail-main-info-group-item::text')[4].extract()
team = response.css('div.item_detail-main-info-group a::text').extract()
about_company = response.css('h2.item_detail-main-content-heading::text').extract()
about_company_detail = response.css('div.markdown p::text').extract()
yield {
'companyname': companyname,
'sectors': sectors,
'investmentyear': investmentyear,
'status': status,
'location': location,
'region': region,
'team': team,
'about_company': about_company,
'about_company_detail' : about_company_detail
}

Scrapy using start_requests with rules

I can't find any solution for using start_requests with rules, also I haven't seen any example on the Internet with this two. My purpose is simple, I wanna redefine start_request function to get an ability catch all exceptions dunring requests and also use meta in requests. This is a code of my spider:
class TestSpider(CrawlSpider):
name = 'test'
allowed_domains = ['www.oreilly.com']
start_urls = ['https://www.oreilly.com/library/view/practical-postgresql/9781449309770/ch04s05.html']
# Base on scrapy doc
def start_requests(self):
for u in self.start_urls:
yield Request(u, callback=self.parse_item, errback=self.errback_httpbin, dont_filter=True)
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
yield item
def errback_httpbin(self, failure):
self.logger.error('ERRRRROR - {}'.format(failure))
This code scrape only one page. I try to modify it and instead of:
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
yield item
I've tried to use this, based on this answer
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
return self.parse(response)
It seems to work, but it doesn't scrape anything, even if I add parse function to my spider. Does anybody know how to use start_request and rules together? I will be glad any information about this topic. Have a nice coding!
I found a solution, but frankly speaking I don't know how it works but it sertantly does it.
class TSpider(CrawlSpider):
name = 't'
allowed_domains = ['books.toscrapes.com']
start_urls = ['https://books.toscrapes.com']
login_page = 'https://books.toscrapes.com'
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def start_requests(self):
yield Request(url=self.login_page, callback=self.login, errback=self.errback_httpbin, dont_filter=True)
def login(self, response):
return FormRequest.from_response(response)
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
yield item
def errback_httpbin(self, failure):
self.logger.error('ERRRRROR - {}'.format(failure))
To catch errors from your rules you need to define errback for your Rule(). But unfortunately this is not possible now.
You need to parse and yield request by yourself (this way you can use errback) or process each response using middleware.
Here is a solution for handle errback in LinkExtractor
Thanks this dude!

Scrapy doesn't return start_urls as an item

Using the following code, I can get Scrapy to crawl the pages of a site, parse those pages and return the results of each page parse as an item for processing in a pipeline.
My issue is that I cannot work out how to process the start_url page. The start_url never gets passed to the parse_item function.
What am I missing?
class GenericSpider(CrawlSpider):
name = "Generic"
allowed_domains = []
start_urls = []
ignored_extensions = [
# images
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
# audio
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
# office suites
'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
# other
'css', 'exe', 'bin', 'rss', 'zip', 'rar',
]
rules = [
Rule(LinkExtractor(deny_extensions=ignored_extensions), follow=True, callback='parse_item')
]
def __init__(self, start_url, source, *args, **kwargs):
super(GenericSpider, self).__init__(*args, **kwargs)
#set common settings
Bootstrap.init(self, kwargs)
self.source = source
self.start_urls = [start_url]
self.allowed_domains = [urlparse.urlparse(start_url).hostname]
def parse_item(self, response):
process response and return item ....
You'll want to define parse_start_url, something like the following should do:
class GenericSpider(CrawlSpider):
name = "Generic"
allowed_domains = []
start_urls = []
...
def parse_item(self, response):
process response and return item ....
parse_start_url = parse_item