Scrapy-Selenium Pagination - selenium

Can anyone help me? I'm practicing and I can't understand what I did wrong on pagination! It only returns the first page to me and sometimes an error comes up. When it works, it just returns the first page.
"The source list for the Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net' It will be ignored", source: https://naturaldaterra.com.br/hortifruti.html?page=2"
import scrapy
from scrapy_selenium import SeleniumRequest
class ComputerdealsSpider(scrapy.Spider):
name = 'produtos'
def start_requests(self):
yield SeleniumRequest(
url='https://naturaldaterra.com.br/hortifruti.html?page=1',
wait_time=3,
callback=self.parse
)
def parse(self, response):
for produto in response.xpath("//div[#class='gallery-items-1IC']/div"):
yield {
'nome_produto': produto.xpath(".//div[#class='item-nameContainer-1kz']/span/text()").get(),
'valor_produto': produto.xpath(".//span[#class='itemPrice-price-1R-']/text()").getall(),
}
next_page = response.xpath("//button[#class='tile-root-1uO'][1]/text()").get()
if next_page:
absolute_url = f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}"
yield SeleniumRequest(
url=absolute_url,
wait_time=3,
callback=self.parse
)

The problem is that your xpath selector returns None instead of the next page number. Consider changing it from
next_page = response.xpath("//button[#class='tile-root-1uO'][1]/text()").get()
to
next_page = response.xpath("//button[#class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
For your future projects consider using scrapy-playwright to scrape js rendered websites. It is faster and simple to use. See a sample implementation of your scraper using scrapy-playwright
import scrapy
from scrapy.crawler import CrawlerProcess
class ComputerdealsSpider(scrapy.Spider):
name = 'produtos'
def start_requests(self):
yield scrapy.Request(
url='https://naturaldaterra.com.br/hortifruti.html?page=1',
meta={"playwright": True}
)
def parse(self, response):
for produto in response.xpath("//div[#class='gallery-items-1IC']/div"):
yield {
'nome_produto': produto.xpath(".//div[#class='item-nameContainer-1kz']/span/text()").get(),
'valor_produto': produto.xpath(".//span[#class='itemPrice-price-1R-']/text()").getall(),
}
# scrape next page
next_page = response.xpath(
"//button[#class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
yield scrapy.Request(
url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
meta={"playwright": True}
)
if __name__ == "__main__":
process = CrawlerProcess(settings={
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}, })
process.crawl(ComputerdealsSpider)
process.start()

Related

Changing next page url within scraper and loading

I am trying to get within several urls of a webpage and follow the response to the next parser to grab another set of urls on a page. However, from this page I need to grab the next page urls but I wanted to try this by manipulating the page string by parsing it and then passing this as the next page. However, the scraper crawls but it returns nothing not even the output on the final parser when I load item.
Note: I know that I can grab the next page rather simply with an if-statement on the href. However, I wanted to try something different in case I had to face a situation where I would have to do this.
Here's my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
stuff = Field()
class ZooplasSpider(scrapy.Spider):
name = 'zooplas'
start_urls = ['https://www.zoopla.co.uk/overseas/']
def start_request(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse, )
def parse(self, response):
container = response.xpath("//ul[#class='list-inline list-unstyled']//li")
for links in container:
urls = links.xpath(".//a/#href").get()
yield response.follow(
urls, callback = self.parse_places
)
def parse_places(self, response):
container = response.xpath("//ul[#class='listing-results clearfix js-gtm-list']//li")
for links in container:
urls = links.xpath('(//div[#class="listing-results-right clearfix"]//a)[position() mod 3=1]//#href').get()
yield response.follow(
urls, callback = self.parse_listings
)
if response.xpath("//div[#id='content']//div//h1//text()").extract_first():
page_on = response.xpath("//div[#id='content']//div//h1//text()").extract_first()
name_of_page = page_on.split()[-1]
else:
pass
if response.xpath("(//div[#class='paginate bg-muted'])//a[last()-1]//href").extract_first():
url_link = response.xpath("(//div[#class='paginate bg-muted'])//a[last()-1]//href").extract_first()
url_link = url_link.split('/')
last_page = url_link[-1].split('=')[-1]
else:
pass
all_pages = []
for index, n in enumerate(url_link):
for page_name, page_num in zip(name_of_page, last_page):
if index == 5:
url_link[index] = page_name
testit='/'.join(url_link)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, page_num+1):
equal_split[another_i] = str(2)
all_pages.append('='.join(equal_split))
for urls in all_pages:
yield response.follow(
urls, callback = self.parse.places
)
def parse_listings(self, response):
loader = ItemLoader(ZooplasItem(), response=response)
loader.default.output_processor = TakeFirst()
loader.add_xpath("//article[#class='dp-sidebar-wrapper__summary']//h1//text()")
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'zoopla.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ZooplasSpider)
process.start()
I know the way of grabbing the urls works as I have tried it on a single url using the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany','France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j=0
for index, n in enumerate(a):
for l_stuff, p_stuff in zip(list_of_stuff,pages_of_stuff):
if index == 5:
a[index] = l_stuff
testit='/'.join(a)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, p_stuff+1):
equal_split[another_i] = str(range_val)
print('='.join(equal_split))
Which is the same as the one used above just a change of variables. this outputs the following links and they work:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
Your use case is suited for using scrapy crawl spider. You can write rules on how to extract links to the properties and how to extract links to the next pages. I have changed your code to use a crawl spider class and I have changed your FEEDS settings to use the recommended settings. FEED_URI and FEED_FORMAT are deprecated in newer versions of scrapy.
Read more about the crawl spider from the docs
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
stuff = Field()
country = Field()
class ZooplasSpider(CrawlSpider):
name = 'zooplas'
allowed_domains = ['zoopla.co.uk']
start_urls = ['https://www.zoopla.co.uk/overseas/']
rules = (
Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True), # follow the countries links
Rule(LinkExtractor(restrict_css='div.paginate'), follow=True), # follow pagination links
Rule(LinkExtractor(restrict_xpaths="//a[contains(#class,'listing-result')]"), callback='parse_item', follow=True), # follow the link to actual property listing
)
def parse_item(self, response):
# here you are on the details page for each property
loader = ItemLoader(ZooplasItem(), response=response)
loader.default_output_processor = TakeFirst()
loader.add_xpath("stuff", "//article[#class='dp-sidebar-wrapper__summary']//h1//text()")
loader.add_xpath("country","//li[#class='ui-breadcrumbs__item'][3]/a/text()")
yield loader.load_item()
if __name__ == '__main__':
process = CrawlerProcess(
settings = {
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
'FEEDS': {
'zoopla.jl': {
'format': 'jsonlines'
}
}
}
)
process.crawl(ZooplasSpider)
process.start()

i write a wrong spider , yet can not find out, who can correct the code?

Spider code is wrong. I created a demon project, but it does not work, kindly check my vs code shot cut & I have no idea for all my spider code and problems.
import scrapy
class EmailSpider(scrapy.Spider):
name='Email'
start_url = [
'http://jsjy.114chn.com/'
]
def parse(self,response):
for Email in response.xpath("//span[#id='lblEmail']"):
yiel{
'email_text': Email.xpath(".//span[#id='lblEmail_text']/p").extract_first()
}
next_page= response.xpath("//li[#class='next']/a/#href").extract_first()
if next_page is not None:
next_page_link= response.urljoin(next_page)
yield scrapy.Request(url=next_page_link, callback=self.parse)
You have problems with indentation and yield function. Also made some code-style corrections:
import scrapy
class EmailSpider(scrapy.Spider):
name = 'Email'
start_url = ['http://jsjy.114chn.com/']
def parse(self, response):
for email in response.xpath("//span[#id='lblEmail']"):
yield {
'email_text': email.xpath(".//span[#id='lblEmail_text']/p").get()
}
next_page = response.xpath("//li[#class='next']/a/#href").get()
if next_page:
yield scrapy.Request(response.urljoin(next_page))
But since you don't have any #lblEmail elements on page, this spider will not output anything.

How to use the yield function to scrape data from multiple pages

I'm trying to scrape data from amazon India website. I am not able collect response and parse the elements using the yield() method when:
1) I have to move from product page to review page
2) I have to move from one review page to another review page
Product page
Review page
Code flow:
1) customerReviewData() calls the getCustomerRatingsAndComments(response)
2) The getCustomerRatingsAndComments(response)
finds the URL of the review page and call the yield request method with getCrrFromReviewPage(request) as callback method, with url of this review page
3) getCrrFromReviewPage() gets new response of the firstreview page and scrape all the elements from the first review page (page loaded) and add it to customerReviewDataList[]
4) get URL of the next page if it exists and recursively call getCrrFromReviewPage() method, and crawl elements from next page, until all the review page is crawled
5) All the reviews gets added to the customerReviewDataList[]
I have tried playing around with yield() changing the parameters and also looked up the scrapy documentation for yield() and Request/Response yield
# -*- coding: utf-8 -*-
import scrapy
import logging
customerReviewDataList = []
customerReviewData = {}
#Get product name in <H1>
def getProductTitleH1(response):
titleH1 = response.xpath('normalize-space(//*[#id="productTitle"]/text())').extract()
return titleH1
def getCustomerRatingsAndComments(response):
#Fetches the relative url
reviewRelativePageUrl = response.css('#reviews-medley-footer a::attr(href)').extract()[0]
if reviewRelativePageUrl:
#get absolute URL
reviewPageAbsoluteUrl = response.urljoin(reviewRelativePageUrl)
yield Request(url = reviewPageAbsoluteUrl, callback = getCrrFromReviewPage())
self.log("yield request complete")
return len(customerReviewDataList)
def getCrrFromReviewPage():
userReviewsAndRatings = response.xpath('//div[#id="cm_cr-review_list"]/div[#data-hook="review"]')
for userReviewAndRating in userReviewsAndRatings:
customerReviewData[reviewTitle] = response.css('#cm_cr-review_list .review-title span ::text').extract()
customerReviewData[reviewDescription] = response.css('#cm_cr-review_list .review-text span::text').extract()
customerReviewDataList.append(customerReviewData)
reviewNextPageRelativeUrl = response.css('#cm_cr-pagination_bar .a-pagination .a-last a::attr(href)')[0].extract()
if reviewNextPageRelativeUrl:
reviewNextPageAbsoluteUrl = response.urljoin(reviewNextPageRelativeUrl)
yield Request(url = reviewNextPageAbsoluteUrl, callback = getCrrFromReviewPage())
class UsAmazonSpider(scrapy.Spider):
name = 'Test_Crawler'
allowed_domains = ['amazon.in']
start_urls = ['https://www.amazon.in/Philips-Trimmer-Cordless-Corded-QT4011/dp/B00JJIDBIC/ref=sr_1_3?keywords=philips&qid=1554266853&s=gateway&sr=8-3']
def parse(self, response):
titleH1 = getProductTitleH1(response),
customerReviewData = getCustomerRatingsAndComments(response)
yield{
'Title_H1' : titleH1,
'customer_Review_Data' : customerReviewData
}
I'm getting the following response:
{'Title_H1': (['Philips Beard Trimmer Cordless and Corded for Men QT4011/15'],), 'customer_Review_Data': <generator object getCustomerRatingsAndComments at 0x048AC630>}
The "Customer_review_Data" should be a list of dict of title and review
I am not able to figure out as to what mistake I am doing here.
When I use the log() or print() to see what data is captured in customerReviewDataList[], unable to see the data in the console either.
I am able to scrape all the reviews in customerReviewDataList[], if they are present in the product page,
In this scenario where I have to use the yield function I am getting the output stated above like this [https://ibb.co/kq8w6cf]
This is the kind of output I am looking for:
{'customerReviewTitle': ['Difficult to find a charger adapter'],'customerReviewComment': ['I already have a phillips trimmer which was only cordless. ], 'customerReviewTitle': ['Good Product'],'customerReviewComment': ['Solves my need perfectly HK']}]}
Any help is appreciated. Thanks in advance.
You should complete the Scrapy tutorial. The Following links section should be specially helpful to you.
This is a simplified version of your code:
def data_request_iterator():
yield Request('https://example.org')
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
yield {
'title': response.css('title::text').get(),
'data': data_request_iterator(),
}
Instead, it should look like this:
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
item = {
'title': response.css('title::text').get(),
}
yield Request('https://example.org', meta={'item': item}, callback=self.parse_data)
def parse_data(self, response):
item = response.meta['item']
# TODO: Extend item with data from this second response as needed.
yield item

scrapy plash set input value?

I've succesfully been able to load javascript generated html with scrapy-splash. Now I want to set a couple input value's which are not part of a form. As soon as I put in a value the content on the site changes. I haven't found a way to set the input value's and rescrap the adjusted html. Is this possible?
class ExampleSpider(scrapy.Spider):
name = "example"
allowed_domains = ["example.com"]
start_urls = (
'https://example.com',
)
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, self.parse, meta={
'splash': {
'endpoint': 'render.html',
'args': {'wait': 3}
}
})
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'screener-%s.html' % page
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
You need to put the input inside a lua_script as someone suggested in the comments, following an example to click a button:
script ="""
function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:runjs('document.getElementsByClassName("nameofbutton").click()'))
assert(splash:wait(0.75))
-- return result as a JSON object
return {
html = splash:html()
}
end
"""
then execute the script like this:
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, self.parse_item, meta={
'splash': {
'args': {'lua_source': self.script},
'endpoint': 'execute',
}
})

Scrapy Spider which reads from Warc file

I am looking for a Scrapy Spider that instead of getting URL's and crawls them, it gets as input a WARC file (preferably from S3) and send to the parse method the content.
I actually need to skip all the download phase, that means that from start_requests method i would like to return a Response that will then send to the parse method.
This is what i have so far:
class WarcSpider(Spider):
name = "warc_spider"
def start_requests(self):
f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
for record in f:
if record.type == "response":
payload = record.payload.read()
headers, body = payload.split('\r\n\r\n', 1)
url=record['WARC-Target-URI']
yield Response(url=url, status=200, body=body, headers=headers)
def parse(self, response):
#code that creates item
pass
Any ideas of what is the Scarpy way of doing that ?
What you want to do is something like this:
class DummyMdw(object):
def process_request(self, request, spider):
record = request.meta['record']
payload = record.payload.read()
headers, body = payload.split('\r\n\r\n', 1)
url=record['WARC-Target-URI']
return Response(url=url, status=200, body=body, headers=headers)
class WarcSpider(Spider):
name = "warc_spider"
custom_settings = {
'DOWNLOADER_MIDDLEWARES': {'x.DummyMdw': 1}
}
def start_requests(self):
f = warc.WARCFile(fileobj=gzip.open("file.war.gz"))
for record in f:
if record.type == "response":
yield Request(url, callback=self.parse, meta={'record': record})
def parse(self, response):
#code that creates item
pass