scrapy_selenium, how to pass driver to parse - selenium

I'm running this code using scrapy_selenium but I'm not able to pass the driver to parse_page, can anyone identify what I'm doing wrong?
class LSpider(scrapy.Spider):
name = 'test'
def start_requests(self):
yield SeleniumRequest(
url='https://www.url.com',
wait_time=3,
screenshot=True,
callback=self.login
)
def login(self, response):
driver = response.request.meta['driver']
search_input = driver.find_element_by_xpath("(//input[#class='input__input'])[1]")
search_input.send_keys("user")
search_input_password = driver.find_element_by_xpath("(//input[#class='input__input'])[2]")
search_input_password.send_keys("password")
search_input.send_keys(Keys.ENTER)
time.sleep(3)
yield SeleniumRequest(
url='https:url.com/url',
callback=self.parse_page,
screenshot=True
)
def parse_page(self, response):
driver = response.request.meta['driver']
search = driver.find_element_by_xpath("//input[#class='search-box']")

Related

scrapy pagination not working on tripadvisor

i'm trying to scrape the restaurant pages on tripadvisor (just to learn how it works)
However, i only get the first page.
What am I missing?
here is the code, thanks!
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name' : stores.css('a.bHGqj::text').extract(),
'link' : stores.css('a.bHGqj').xpath("#href").extract()
}
next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
##next_page = response.xpath('//a[contains(text(), "Next")]/#href).extract())
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
#djmystica, Now it's working fine
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = [
'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name': stores.css('a.bHGqj::text').extract_first(),
'link': stores.css('a.bHGqj').xpath("#href").extract_first()}
#next_page = ('http://tripadvisor.com' +response.css('a.nav').attrib['href']).extract()
next_page = response.xpath('//a[contains(text(), "Next")]/#href').extract_first()
abs_next_page = f'https://www.tripadvisor.com{next_page}'
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if abs_next_page is not None:
yield response.follow(abs_next_page, callback=self.parse)

Scrapy using start_requests with rules

I can't find any solution for using start_requests with rules, also I haven't seen any example on the Internet with this two. My purpose is simple, I wanna redefine start_request function to get an ability catch all exceptions dunring requests and also use meta in requests. This is a code of my spider:
class TestSpider(CrawlSpider):
name = 'test'
allowed_domains = ['www.oreilly.com']
start_urls = ['https://www.oreilly.com/library/view/practical-postgresql/9781449309770/ch04s05.html']
# Base on scrapy doc
def start_requests(self):
for u in self.start_urls:
yield Request(u, callback=self.parse_item, errback=self.errback_httpbin, dont_filter=True)
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
yield item
def errback_httpbin(self, failure):
self.logger.error('ERRRRROR - {}'.format(failure))
This code scrape only one page. I try to modify it and instead of:
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
yield item
I've tried to use this, based on this answer
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
return self.parse(response)
It seems to work, but it doesn't scrape anything, even if I add parse function to my spider. Does anybody know how to use start_request and rules together? I will be glad any information about this topic. Have a nice coding!
I found a solution, but frankly speaking I don't know how it works but it sertantly does it.
class TSpider(CrawlSpider):
name = 't'
allowed_domains = ['books.toscrapes.com']
start_urls = ['https://books.toscrapes.com']
login_page = 'https://books.toscrapes.com'
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def start_requests(self):
yield Request(url=self.login_page, callback=self.login, errback=self.errback_httpbin, dont_filter=True)
def login(self, response):
return FormRequest.from_response(response)
def parse_item(self, response):
item = {}
item['title'] = response.xpath('//head/title/text()').extract()
item['url'] = response.url
yield item
def errback_httpbin(self, failure):
self.logger.error('ERRRRROR - {}'.format(failure))
To catch errors from your rules you need to define errback for your Rule(). But unfortunately this is not possible now.
You need to parse and yield request by yourself (this way you can use errback) or process each response using middleware.
Here is a solution for handle errback in LinkExtractor
Thanks this dude!

Scrapy response incomplete get url how to

I would like to parse the value obtained from parse again by connecting to another url. How do I fix it?
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import StackItem
class StackSpider(Spider):
name = "stack"
allowed_domains = ["*"]
global n
#n = 1997
start_urls = ['https://www.melon.com/chart/age/list.htm?chartType=YE&chartGenre=KPOP&chartDate=2010',]
def parse(self, response):
url = 'https://www.melon.com/song/detail.htm?songId='
questions = Selector(response).xpath('//*[#id="frm"]/table/tbody/tr')
for question in questions:
item = StackItem()
item['musicid'] = question.xpath('td/div/input/#value').extract()[0]
item['title'] = question.xpath('td[4]/div/div/div/span/strong/a/#title').extract()
item['artlist'] = question.xpath(
'td[4]/div/div/div[2]/div[1]/a/text()').extract()
item['album'] = question.xpath(
'td[4]/div/div/div[2]/div[2]/a/text()').extract()
item['sunwhi'] = question.xpath(
'td[2]/div/span/text()').extract()[0]
response_url=requests.get(url+musicid)
def parse(self, response):
questions = Selector(response).xpath('//*[#id="downloadfrm"]/div/div/div[2]/div[2]/dl/dd')
for question in questions:
item = StackItem()
item['album'] = question.xpath('a/text()').extract()[0]
yield item
class StackSpider(Spider):
name = "stack"
allowed_domains = ["*"]
global n
#n = 1997
start_urls = ['https://www.melon.com/chart/age/list.htm?chartType=YE&chartGenre=KPOP&chartDate=2010',]
def parse(self, response):
url = 'https://www.melon.com/song/detail.htm?songId='
questions = Selector(response).xpath('//*[#id="frm"]/table/tbody/tr')
for question in questions:
item = StackItem()
item['musicid'] = question.xpath('td/div/input/#value').extract()[0]
item['title'] = question.xpath('td[4]/div/div/div/span/strong/a/#title').extract()
item['artlist'] = question.xpath(
'td[4]/div/div/div[2]/div[1]/a/text()').extract()
item['album'] = question.xpath(
'td[4]/div/div/div[2]/div[2]/a/text()').extract()
item['sunwhi'] = question.xpath(
'td[2]/div/span/text()').extract()[0]
response_url=requests.get(url+musicid)
def parse(self, response):
questions = Selector(response).xpath('//*[#id="downloadfrm"]/div/div/div[2]/div[2]/dl/dd')
for question in questions:
item = StackItem()
item['album'] = question.xpath('a/text()').extract()[0]
yield item

scrapy callback function not entered

I have been searching for a solution for this, but none of those I found worked for me. After spending 2 days already debugging this, I should just ask you guys for help.
The urls look fine. Even I hard code a url before the request code, the callback function still not working.
My code is:
def parse_link(self, response):
print 'lllll', response.url
print 'bbbbb', len(response.body), response.body
def parse(self, response):
hxs = HtmlXPathSelector(response)
issues = hxs.select('//a//#id').extract()
for i in range(len(issues)):
issue = issues[i]
links_2d = hxs.select('//html//body//table[%d+%d]/tr/td//a[contains(text(),"full quotes")]/#href' % (9, i)).extract()
links_2d = list(set(links_2d))
if len(bb) < 1: continue
if len(links_2d) < 1: continue
full_link = links_2d[0]
yield scrapy.Request(url=full_link, callback = self.parse_link)
Try this:
def parse(self, response):
hxs = HtmlXPathSelector(response)
issues = hxs.select('//a//#id').extract()
for i in range(len(issues)):
issue = issues[i]
links_2d = hxs.select('//html//body//table[%d+%d]/tr/td//a[contains(text(),"full quotes")]/#href' % (9, i)).extract()
links_2d = list(set(links_2d))
if len(bb) < 1: continue
if len(links_2d) < 1: continue
full_link = links_2d[0]
yield scrapy.Request(str(full_link), self.parse_link)
def parse_link(self, response):
print 'lllll', response.url
print 'bbbbb', len(response.body), response.body

Scrape multiple URLs with Scrapy

How can I scrape multiple URLs with Scrapy?
Am I forced to make multiple crawlers?
class TravelSpider(BaseSpider):
name = "speedy"
allowed_domains = ["example.com"]
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4),"http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Python says:
NameError: name 'i' is not defined
But when I use one URL it works fine!
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)"]
Your python syntax is incorrect, try:
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)] + \
["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
If you need to write code to generate start requests, you can define a start_requests() method instead of using start_urls.
You can initialize start_urls in __init__.py method:
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
class TravelItem(Item):
url = Field()
class TravelSpider(BaseSpider):
def __init__(self, name=None, **kwargs):
self.start_urls = []
self.start_urls.extend(["http://example.com/category/top/page-%d/" % i for i in xrange(4)])
self.start_urls.extend(["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)])
super(TravelSpider, self).__init__(name, **kwargs)
name = "speedy"
allowed_domains = ["example.com"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Hope that helps.
There are only four ranges in Python: LEGB, because the local scope of the class definition and the local extent of the list derivation are not nested functions, so they do not form the Enclosing scope.Therefore, they are two separate local scopes that cannot be accessed from each other.
so, don't use 'for' and class variables at the same time