Scrapy doesn't return start_urls as an item

Using the following code, I can get Scrapy to crawl the pages of a site, parse those pages and return the results of each page parse as an item for processing in a pipeline.
My issue is that I cannot work out how to process the start_url page. The start_url never gets passed to the parse_item function.
What am I missing?
class GenericSpider(CrawlSpider):
    name = "Generic"
    allowed_domains = []
    start_urls = []

    ignored_extensions = [
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
        # office suites
        'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
        # other
        'css', 'exe', 'bin', 'rss', 'zip', 'rar',
    ]

    rules = [
        Rule(LinkExtractor(deny_extensions=ignored_extensions), follow=True, callback='parse_item')
    ]

    def __init__(self, start_url, source, *args, **kwargs):
        super(GenericSpider, self).__init__(*args, **kwargs)

        # set common settings
        Bootstrap.init(self, kwargs)

        self.source = source
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse.urlparse(start_url).hostname]

    def parse_item(self, response):
        # process response and return item ...

You'll want to define parse_start_url, something like the following should do:
class GenericSpider(CrawlSpider):
    name = "Generic"
    allowed_domains = []
    start_urls = []

    ...

    def parse_item(self, response):
        # process response and return item ...

    parse_start_url = parse_item
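For context, CrawlSpider routes the responses for start_urls through parse_start_url rather than through the rule callbacks (and the default parse_start_url returns nothing), which is why the start page never reached parse_item. The alias above makes the start page go through the same processing. If you prefer an explicit method over the alias, a minimal sketch (reusing the same parse_item as above) could look like this:

class GenericSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        # Route the start-URL response through the same item-building
        # logic used by the rule callback.
        return self.parse_item(response)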

Related

How to set headless = False in scrapy-playwright?

In scrapy-playwright, how do I set headless = False? I am trying something like this.
def start_requests(self):
    yield scrapy.Request(
        url=url,
        callback=self.parse,
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageCoroutine("click", selector="//input[@name='typeAheadInputField']"),
            ]
        )
    )
    return super().start_requests()

def parse(self, response):
    pass
settings.py
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": False}
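If you'd rather keep the option next to the spider instead of in the project-wide settings.py, a sketch using Scrapy's per-spider custom_settings (spider name is hypothetical, for illustration only) would be:

import scrapy

class MySpider(scrapy.Spider):
    name = "my_spider"  # hypothetical name

    # Per-spider settings override settings.py, so only this spider
    # launches the browser with a visible window.
    custom_settings = {
        "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": False},
    }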

scrapy pagination not working on tripadvisor

I'm trying to scrape the restaurant pages on tripadvisor (just to learn how it works).
However, I only get the first page.
What am I missing?
Here is the code, thanks!
import scrapy

class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract(),
                'link': stores.css('a.bHGqj').xpath("@href").extract()
            }

        next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        ##next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract()
        #next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
@djmystica, now it's working fine:
import scrapy

class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract_first(),
                'link': stores.css('a.bHGqj').xpath("@href").extract_first()}

        #next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
        abs_next_page = f'https://www.tripadvisor.com{next_page}'
        #next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if abs_next_page is not None:
            yield response.follow(abs_next_page, callback=self.parse)
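One caveat with this fix: the f-string always produces a string, so abs_next_page is never None even on the last page. A sketch of the pagination step (same selector as above, not from the original answer) that only follows the link when a Next href is actually found:

def parse(self, response):
    # ... yield the per-restaurant items as above ...

    next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
    if next_page is not None:
        # response.follow resolves relative hrefs against the current page,
        # so the domain does not need to be hard-coded.
        yield response.follow(next_page, callback=self.parse)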

How to limit the levels of links that the Scrapy crawler follows?

For this example code:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
If I want the crawler to visit at most 3 levels of links, how do I restrict that? Not the total number of links visited, but the level of links relative to the initial link.
You can use the DEPTH_LIMIT setting in your spider to limit the depth of the crawl:
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    custom_settings = {
        'DEPTH_LIMIT': 3
    }
...
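As a possible alternative (a sketch, not part of the original answer), the same limit can be set project-wide in settings.py, or overridden per run from the command line:

# settings.py
# Responses reached more than this many links away from the start URLs
# are dropped by Scrapy's DepthMiddleware.
DEPTH_LIMIT = 3

# Or per run, without editing any file:
#   scrapy crawl quotes -s DEPTH_LIMIT=3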

scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme

I collect links from several directories and then insert them into start_urls via a links variable.
import scrapy

class SplashSpider(scrapy.Spider):
    f = open('text.txt')
    links = f.read()
    name = 'spide'
    start_urls = [str(links)]
    f.close()

    def parse(self, response):
        title = response.css('.title::text').extract()
        description = response.css("div#desc").extract()
        title = list(map(str.strip, title))
        description = list(map(str.strip, description))
        yield {
            'Title': title,
            'Main Info': description,
        }
but I'm catching an error: scrapy.exceptions.NotSupported: Unsupported URL scheme '': no handler available for that scheme
my text.txt file:
'https:// url1.com','https:// url2.com', ... , 'https:// url300000.com', 'https:// url300001.com'
import scrapy

class SplashSpider(scrapy.Spider):
    with open('text.txt') as f:
        links = f.readlines()
        links = list(map(lambda x: x.strip().replace(' ', ''), links))

    name = 'spider'
    start_urls = links

    def parse(self, response):
        title = response.css('.title::text').extract()
        description = response.css("div#desc").extract()
        title = list(map(str.strip, title))
        description = list(map(str.strip, description))
        yield {
            'Title': title,
            'Main Info': description,
        }
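Note that readlines() only helps if text.txt has one URL per line. If the file really is a single comma-separated line of quoted URLs as shown in the question, a sketch like the following (assuming exactly that format) would produce clean entries for start_urls:

import scrapy

class SplashSpider(scrapy.Spider):
    name = 'spider'

    # Split the single comma-separated line, strip whitespace and the
    # surrounding quotes, and remove the stray space after 'https://'.
    with open('text.txt') as f:
        start_urls = [
            part.strip().strip("'\"").replace(' ', '')
            for part in f.read().split(',')
            if part.strip()
        ]

    def parse(self, response):
        ...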

Scrapy using start_requests with rules

I can't find any solution for using start_requests with rules, and I haven't seen any example on the Internet with these two. My purpose is simple: I want to redefine the start_requests function to be able to catch all exceptions during requests and also to use meta in requests. This is the code of my spider:
class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['www.oreilly.com']
    start_urls = ['https://www.oreilly.com/library/view/practical-postgresql/9781449309770/ch04s05.html']

    # Based on the scrapy doc
    def start_requests(self):
        for u in self.start_urls:
            yield Request(u, callback=self.parse_item, errback=self.errback_httpbin, dont_filter=True)

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        yield item

    def errback_httpbin(self, failure):
        self.logger.error('ERRRRROR - {}'.format(failure))
This code scrapes only one page. I tried to modify it, and instead of:
def parse_item(self, response):
    item = {}
    item['title'] = response.xpath('//head/title/text()').extract()
    item['url'] = response.url
    yield item
I've tried to use this, based on this answer
def parse_item(self, response):
    item = {}
    item['title'] = response.xpath('//head/title/text()').extract()
    item['url'] = response.url
    return self.parse(response)
It seems to work, but it doesn't scrape anything, even if I add a parse function to my spider. Does anybody know how to use start_requests and rules together? I will be glad to get any information about this topic. Happy coding!
I found a solution, but frankly speaking I don't know how it works, but it certainly does.
class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrapes.com']
    start_urls = ['https://books.toscrapes.com']
    login_page = 'https://books.toscrapes.com'

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def start_requests(self):
        yield Request(url=self.login_page, callback=self.login, errback=self.errback_httpbin, dont_filter=True)

    def login(self, response):
        return FormRequest.from_response(response)

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        yield item

    def errback_httpbin(self, failure):
        self.logger.error('ERRRRROR - {}'.format(failure))
To catch errors from your rules you would need to define an errback for your Rule(). Unfortunately, this is not possible right now.
You need to parse and yield the requests yourself (that way you can use errback), or process each response using a middleware; see the sketch below.
Here is a solution for handling errback with a LinkExtractor.
Thanks to this dude!
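To illustrate the "parse and yield the requests yourself" route, here is a minimal sketch (not from the original answer; spider name and start URL are placeholders) of a plain Spider that runs a LinkExtractor by hand so every follow-up Request can carry an errback:

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor


class ManualLinksSpider(scrapy.Spider):
    # Hypothetical spider name and start URL, for illustration only.
    name = 'manual_links'
    start_urls = ['https://books.toscrapes.com']

    link_extractor = LinkExtractor()

    def parse(self, response):
        # Build the item exactly as parse_item did above.
        yield {
            'title': response.xpath('//head/title/text()').extract(),
            'url': response.url,
        }

        # Extract the links ourselves instead of relying on a Rule,
        # so each request can be given an errback (and meta if needed).
        for link in self.link_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse, errback=self.errback_httpbin)

    def errback_httpbin(self, failure):
        self.logger.error('ERRRRROR - {}'.format(failure))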