I'm trying to access the response (to use its URL as a condition) from a process_links function so I can rewrite the extracted URLs. Is there any way to do this? Currently I get the error: process_links() takes exactly 3 arguments (2 given)
class Spider(CrawlSpider):
    name = 'spider_1'
    allowed_domains = ['domain.com']
    start_urls = (
        'http://domain.com/new/1.html?content=image',
        'http://domain.com/new/1.html?content=video',
    )

    rules = [
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]',)),
             callback='parse_page', process_links='process_links', follow=True)
    ]

    def process_links(self, links, resp):
        for link in links:
            if 'content=photo' in resp.url:
                link.url = "%s?content=photo" % link.url
            else:
                link.url = "%s?content=video" % link.url
        return links
Change
def process_links(self, links, resp):
to
def process_links(self, links):
You expect to receive the response in your function, but Scrapy gives you just the links.
Maybe something like this could be what you want:
rules = [
    Rule(LinkExtractor(allow=('content=photo',), restrict_xpaths=('//div[@class="pagination"]',)),
         callback='parse_page', process_links='process_photo_links', follow=True),
    Rule(LinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]',)),
         callback='parse_page', process_links='process_video_links', follow=True),
]

def process_photo_links(self, links):
    for link in links:
        link.url = "%s?content=photo" % link.url
    return links

def process_video_links(self, links):
    for link in links:
        link.url = "%s?content=video" % link.url
    return links
Update after comment:
No, Scrapy does not pass the response to process_links.
You could simply ignore the rules and generate the requests yourself:
def parse_page(self, response):
    ...
    links = LinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]',)).extract_links(response)
    for link in links:
        if 'content=photo' in response.url:
            link.url = "%s?content=photo" % link.url
        else:
            link.url = "%s?content=video" % link.url
        yield scrapy.Request(link.url, callback=self.parse_page)
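As an aside, on newer Scrapy versions (2.0+) a Rule's process_request callable also receives the response the link was extracted from, which may be a more direct way to do this kind of URL rewriting. A minimal sketch, assuming Scrapy >= 2.0 (tag_request is just a hypothetical method name):

rules = [
    Rule(LinkExtractor(restrict_xpaths=('//div[@class="pagination"]',)),
         callback='parse_page', process_request='tag_request', follow=True),
]

def tag_request(self, request, response):
    # rewrite the outgoing request URL based on the page it was extracted from
    suffix = 'content=photo' if 'content=photo' in response.url else 'content=video'
    return request.replace(url="%s?%s" % (request.url, suffix))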
Related
I'm trying to scrape the restaurant pages on TripAdvisor (just to learn how it works).
However, I only get the first page.
What am I missing?
Here is the code, thanks!
import scrapy

class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract(),
                'link': stores.css('a.bHGqj').xpath("@href").extract()
            }

        next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        ##next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract()
        #next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
@djmystica, now it's working fine:
import scrapy

class TripadvSpider(scrapy.Spider):
    name = 'tripadv'
    allowed_domains = ['tripadvisor.com']
    start_urls = [
        'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']

    def parse(self, response):
        for stores in response.css('div.emrzT'):
            yield {
                'name': stores.css('a.bHGqj::text').extract_first(),
                'link': stores.css('a.bHGqj').xpath("@href").extract_first()}

        #next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
        next_page = response.xpath('//a[contains(text(), "Next")]/@href').extract_first()
        #next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
        if next_page is not None:
            abs_next_page = f'https://www.tripadvisor.com{next_page}'
            yield response.follow(abs_next_page, callback=self.parse)
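Minor note: response.follow() resolves relative hrefs against the current page, so the manual f-string join isn't strictly needed; under the same assumptions as the code above, this would also work:

if next_page is not None:
    # response.follow() turns the relative href into an absolute URL for you
    yield response.follow(next_page, callback=self.parse)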
For this example code:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
If I want the crawler to visit at most 3 levels of links, how can I restrict that? Not the total number of links visited, but the depth of links relative to the initial link.
You can use the DEPTH_LIMIT setting in your spider in order to limit the depth of the crawl:
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    custom_settings = {
        'DEPTH_LIMIT': 3
    }
    ...
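For reference, requests generated from start_urls have depth 0, so DEPTH_LIMIT = 3 lets the crawl follow links up to three levels away from the initial pages. If you'd rather not hard-code it in the spider, the same setting can be passed on the command line:

scrapy crawl quotes -s DEPTH_LIMIT=3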
I can't find any solution for using start_requests together with rules, and I haven't seen any example on the Internet combining these two. My purpose is simple: I want to redefine the start_requests function so I can catch all exceptions during requests and also use meta in the requests. This is the code of my spider:
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['www.oreilly.com']
    start_urls = ['https://www.oreilly.com/library/view/practical-postgresql/9781449309770/ch04s05.html']

    # Based on the Scrapy docs
    def start_requests(self):
        for u in self.start_urls:
            yield Request(u, callback=self.parse_item, errback=self.errback_httpbin, dont_filter=True)

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        yield item

    def errback_httpbin(self, failure):
        self.logger.error('ERRRRROR - {}'.format(failure))
This code scrapes only one page. I tried to modify it, and instead of:
def parse_item(self, response):
    item = {}
    item['title'] = response.xpath('//head/title/text()').extract()
    item['url'] = response.url
    yield item
I've tried to use this, based on this answer
def parse_item(self, response):
    item = {}
    item['title'] = response.xpath('//head/title/text()').extract()
    item['url'] = response.url
    return self.parse(response)
It seems to run, but it doesn't scrape anything, even if I add a parse function to my spider. Does anybody know how to use start_requests and rules together? I would be glad for any information on this topic. Happy coding!
I found a solution; frankly speaking I don't know how it works, but it certainly does.
from scrapy import Request, FormRequest
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class TSpider(CrawlSpider):
    name = 't'
    allowed_domains = ['books.toscrapes.com']
    start_urls = ['https://books.toscrapes.com']
    login_page = 'https://books.toscrapes.com'

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def start_requests(self):
        yield Request(url=self.login_page, callback=self.login, errback=self.errback_httpbin, dont_filter=True)

    def login(self, response):
        return FormRequest.from_response(response)

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        yield item

    def errback_httpbin(self, failure):
        self.logger.error('ERRRRROR - {}'.format(failure))
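For what it's worth, the likely reason this works is that a CrawlSpider only applies its rules to responses handled by its built-in parse() callback; the FormRequest returned from login() has no explicit callback, so it falls back to parse() and the rules kick in. A minimal sketch of the same idea without the login detour, assuming all you need is errback and dont_filter on the start requests:

def start_requests(self):
    for u in self.start_urls:
        # no explicit callback: the request falls back to CrawlSpider.parse(),
        # so the rules still run, and errback still catches failures
        yield Request(u, errback=self.errback_httpbin, dont_filter=True)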
To catch errors from your rules you would need to define an errback for your Rule(), but unfortunately that is not possible at the moment.
You need to extract and yield the requests yourself (that way you can use errback), or process each response using a middleware.
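A minimal sketch of that manual approach, assuming a plain Spider with an errback_httpbin method like the one above (the callback and errback names are placeholders):

def parse_item(self, response):
    # ...yield your item fields here...
    for link in LinkExtractor().extract_links(response):
        yield Request(link.url, callback=self.parse_item, errback=self.errback_httpbin)

(As an aside, on Scrapy 2.0 or newer, Rule() itself accepts an errback argument, so this workaround may no longer be necessary.)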
Here is a solution for handling errback with a LinkExtractor.
Thanks to this dude!
I am trying to crawl some data as my side project but I am having a problem gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get a wrong token.
Second problem:
I have read the Scrapy docs on Request and tried to implement them to get the phone number, but in vain,
or this answer
stackoverflow
Third problem:
How would I go about implementing the next page (see the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers, but I am getting repeated requests to the same URL with different values, [see results].
I would really appreciate it if anyone could point me in the right direction.
My main goal is to crawl posts that have phone numbers.
I have tried searching Stack Overflow but couldn't find the proper post.
Many thanks!
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector

def complete_link(string):
    return string

class MyItem(Item):
    token = Field()
    post_id = Field()
    post_url = Field()
    phone_num = Field()
    phone_url = Field()

class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = MyItem()
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = 'https://www.gumtree.com' + href
            request = Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
            yield request

        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if post_id:
            item['post_id'] = post_id.group(1)
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
            request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'],
                              headers={'X-GUMTREE-TOKEN': item['token']},
                              callback=self.parse_phone, meta={'item': item})
            yield request

    def parse_phone(self, response):
        item = response.meta['item']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
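On the repeated values in the output (the fourth problem): a likely cause is that a single MyItem instance is created once in parse() and shared, via meta, by every request, so concurrent callbacks overwrite each other's fields before the items are exported. A minimal sketch, creating a fresh item per listing instead:

def parse(self, response):
    for href in response.css('a.listing-link::attr(href)').extract():
        item = MyItem()  # one item per listing, not one shared instance
        url = response.urljoin(href)
        yield Request(url, callback=self.parse_post, meta={'domain': url, 'item': item})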
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector

gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'

class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = CrawlerItem()
        pid = []
        arr_url = []
        for href in response.css('a.listing-link::attr(href)').extract():
            if len(href) > 0:
                post_id = u''.join(href).encode('utf-8').strip()
                post_id = re.match('.*?([0-9]+)$', post_id)
                if post_id:
                    pid.append(post_id.group(1))
                    domain = gumtree + href
                    arr_url.append(domain)

        i = 0
        while i < len(arr_url):
            url = u''.join(arr_url[i]).encode('utf-8').strip()
            request = Request(url, callback=self.parse_post,
                              meta={'url': url, 'item': item, 'pid': pid[i]},
                              headers={'Referer': gumtree})
            i += 1
            yield request

        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
            ref = item['post_url']
            req = Request(getphone + item['post_id'], callback=self.parse_phone,
                          headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': ref},
                          meta={'url': response.meta['url'], 'item': item})
            return req

    def parse_phone(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
        return item
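One small note on this solution: response.body_as_unicode() is deprecated on newer Scrapy releases; if you are on a recent version, response.text is the equivalent:

phone = json.loads(response.text)  # same as body_as_unicode() on older Scrapy versions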
I'm interested in getting contractors' data for Atlanta from this page:
http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658
So far I can open the links for the categories
'Additions & Remodeling'
'Architects & Engineers'
'Fountains & Ponds'
...
But I can open only the first page:
http://www.1800contractor.com/d.Additions-Remodeling.Atlanta.GA.-12001.html?startingIndex=0&showDirectory=true
I'm trying to get the next one using the link from the 'Next' button:
next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
request = scrapy.Request(absolute_next_page_url)
yield request
But it makes no difference.
This is the code of my spider:
import scrapy

class Spider_1800(scrapy.Spider):
    name = '1800contractor'
    allowed_domains = ['1800contractor.com']
    start_urls = (
        'http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658',
    )

    def parse(self, response):
        urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
        for url in urls:
            absolute_url = response.urljoin(url)
            request = scrapy.Request(
                absolute_url, callback=self.parse_contractors)
            yield request

        # process next page
        next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        request = scrapy.Request(absolute_next_page_url)
        yield request

    def parse_contractors(self, response):
        name = response.xpath(
            '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
        contractor = {
            'name': name,
            'url': response.url}
        yield contractor
You are not paginating the right request. parse handles the requests generated from the URLs in start_urls, which means you first need to enter each category from http://www.1800contractor.com/d.Atlanta.GA.html?link_id=3658 and paginate from the category pages.
def parse(self, response):
    urls = response.xpath('/html/body/center/table/tr/td[2]/table/tr[6]/td/table/tr[2]/td/b/a/@href').extract()
    for url in urls:
        absolute_url = response.urljoin(url)
        request = scrapy.Request(
            absolute_url, callback=self.parse_contractors)
        yield request

def parse_contractors(self, response):
    name = response.xpath(
        '/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    contractor = {
        'name': name,
        'url': response.url}
    yield contractor

    next_page_url = response.xpath('/html/body/div[1]/center/table/tr[8]/td[2]/a/@href').extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)
After hitting the start_url, your XPath for picking the contractor URLs is not working. The 'Next' page link is present on the contractor page, therefore it is requested after the contractor URL. This will work for you:
def parse(self, response):
    urls = response.xpath('//table//*[@class="hiCatNaked"]/@href').extract()
    for url in urls:
        absolute_url = response.urljoin(url)
        request = scrapy.Request(
            absolute_url, callback=self.parse_contractors)
        yield request

def parse_contractors(self, response):
    name = response.xpath('/html/body/div[1]/center/table/tr[5]/td/table/tr[1]/td/b/a/@href').extract()
    contractor = {
        'name': name,
        'url': response.url}
    yield contractor

    next_page_url = response.xpath('//a[b[contains(., "Next")]]/@href').extract_first()
    if next_page_url:
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url, callback=self.parse_contractors)