Cycle in Scrapy ItemLoader - scrapy

https://doc.scrapy.org/en/latest/topics/loaders.html#using-item-loaders-to-populate-items
from scrapy.loader import ItemLoader
from myproject.items import Product
def parse(self, response):
    """Build one ``Product`` from ``response`` with an ``ItemLoader``.

    ``name`` is fed from two XPath expressions, ``price`` from one XPath,
    ``stock`` from a CSS selector and ``last_updated`` from a literal value.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    l = ItemLoader(item=Product(), response=response)
    # XPath attribute tests are written ``@attr``; ``#attr`` is not valid XPath.
    l.add_xpath('name', '//div[@class="product_name"]')
    l.add_xpath('name', '//div[@class="product_title"]')
    l.add_xpath('price', '//p[@id="price"]')
    # CSS id selector; the stray closing bracket made the selector invalid.
    l.add_css('stock', 'p#stock')
    l.add_value('last_updated', 'today')  # you can also use literal values
    return l.load_item()
But if the web page contains several products (two names, two prices, etc.), how do I get all of them out of l.load_item()?
I added a loop, but because the method ends with return, the loop body only runs once.
What is the right way to do this?

Just replace return l.load_item() with yield l.load_item()
example:
for block in response.css('.blocks'):
product_name = block.css('div.product_name').extract_first()
product_title = block.css('div.product_title').extract_first()
price = block.css('p#price').extract_first()
stock = block.css('p#stock').extract_first()
yield Product(
product_name=product_name,
product_title=product_title,
price=price,
stock=stock,
last_updated='today'
)
If you use the ItemLoader, you have to create a new loader for each iteration:
for block in blocks:
l = ItemLoader(item=Product(), response=response)
...
yield l.load_item()
"In the callback function, you parse the response (web page) and return either dicts with extracted data, Item objects, Request objects, or an iterable of these objects." See scrapy documentation
yield is used to generate an iterable of your Product Item objects

Related

How do I connect items from one parse method to another?

'''
import scrapy
from ..items import GooddealItem
class FarmtoolsSpider(scrapy.Spider):
    """Crawl the advert listing and follow selected adverts to their detail page."""

    name = 'farmtools'
    allowed_domains = ['www.gooddeal.com']
    # The URL has to be a single string literal; it cannot span two lines.
    start_urls = [
        'https://www.gooddeal.com/all?source=private&sort=publishdate%20desc',
    ]

    def parse(self, response):
        """Read summary fields from each listing row and follow '0 min' rows.

        Yields:
            One follow-up request per row whose key-info text equals
            ``'0 min'``; the response is handled by ``parse_item_page``.
        """
        items = GooddealItem()
        # XPath attribute tests are written ``@attr``; ``#attr`` is not valid XPath.
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            link = row.xpath('.//a/@href').get()  # this is the full link.
            link_split = link.split('/')[-1]  # this splits the url link the first time.
            linkid = link_split.split('?')[0]  # this splits it the second time.
            title = row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get()
            county = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
            price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
            zero = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
            if zero == '0 min':
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                items['zero'] = zero
                items['link'] = link
                # NOTE(review): ``items`` is filled in above but is not attached
                # to this request, so the callback never receives it.
                yield response.follow(url=link, callback=self.parse_item_page)

    def parse_item_page(self, response):
        """Read detail-page fields into a new ``GooddealItem`` and yield it.

        NOTE(review): a fresh item is created here, so none of the fields
        set in ``parse`` are present on the yielded item.
        """
        items = GooddealItem()
        rows = response.xpath('/html/body[1]')
        for row in rows:
            # NOTE(review): the trailing commas on the next four assignments
            # turn each value into a one-element tuple such as ``(None,)``;
            # only ``lifetime_ads`` is stored as a plain value.
            category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
            views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
            seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/@href').get(),
            seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
            lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()
            items['category'] = category
            items['views'] = views
            items['seller_id'] = seller_id
            items['seller_ads'] = seller_ads
            items['lifetime_ads'] = lifetime_ads
            yield items
'''
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks
I'm assuming you want the data scraped in parse method to be joined together with the data scraped in the parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter takes a dict of arbitrary data whose entries are passed as keyword arguments to the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter. Like this:
def parse_item_page(self, response, scraped_item):
...
Scrapy will take care of sending the scraped_item when calling the parse_item_page.
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This method still works in more recent versions, but cb_kwargs (the solution above) is preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
items = response.meta.get('scraped_item') #response.meta is a dict
...

How to use the yield function to scrape data from multiple pages

I'm trying to scrape data from the Amazon India website. I am not able to collect the response and parse the elements using yield when:
1) I have to move from product page to review page
2) I have to move from one review page to another review page
Product page
Review page
Code flow:
1) customerReviewData() calls the getCustomerRatingsAndComments(response)
2) The getCustomerRatingsAndComments(response)
finds the URL of the review page and call the yield request method with getCrrFromReviewPage(request) as callback method, with url of this review page
3) getCrrFromReviewPage() gets the new response for the first review page, scrapes all the elements from that page (page loaded) and adds them to customerReviewDataList[]
4) It then gets the URL of the next page, if it exists, and recursively calls getCrrFromReviewPage() to crawl the elements from the next page, until all the review pages are crawled
5) All the reviews get added to customerReviewDataList[]
I have tried playing around with yield() changing the parameters and also looked up the scrapy documentation for yield() and Request/Response yield
# -*- coding: utf-8 -*-
import scrapy
import logging
# Module-level containers: getCrrFromReviewPage() writes review fields into
# the dict and appends that dict to the list.
customerReviewDataList = []
customerReviewData = {}
def getProductTitleH1(response):
    """Return the whitespace-normalised product title found in the <H1>.

    Args:
        response: object exposing ``xpath()`` whose result has ``extract()``.

    Returns:
        Whatever ``extract()`` returns for the title expression (a list of
        strings in the output shown for this spider).
    """
    # ``@id`` is the XPath attribute test; ``#id`` is not valid XPath.
    title_query = 'normalize-space(//*[@id="productTitle"]/text())'
    return response.xpath(title_query).extract()
def getCustomerRatingsAndComments(response):
    """Yield a request for the review page linked from the product page.

    NOTE(review): the ``yield`` makes this a generator function, so calling
    it hands back a generator object rather than review data or a count.
    """
    # Fetches the relative url
    footer_links = response.css('#reviews-medley-footer a::attr(href)').extract()
    relative_url = footer_links[0]
    if relative_url:
        # get absolute URL
        absolute_url = response.urljoin(relative_url)
        # NOTE(review): the parentheses call getCrrFromReviewPage right here
        # instead of passing the function itself as the callback.
        yield Request(url=absolute_url, callback=getCrrFromReviewPage())
        # NOTE(review): ``self`` is not defined in this module-level function.
        self.log("yield request complete")
    return len(customerReviewDataList)
def getCrrFromReviewPage():
    """Collect review titles and texts from a review page, then paginate.

    NOTE(review): ``response``, ``reviewTitle`` and ``reviewDescription`` are
    not parameters and are not defined in the visible module — confirm where
    they are meant to come from.
    """
    # XPath attribute tests are written ``@attr``; ``#attr`` is not valid XPath.
    userReviewsAndRatings = response.xpath('//div[@id="cm_cr-review_list"]/div[@data-hook="review"]')
    for userReviewAndRating in userReviewsAndRatings:
        # NOTE(review): both selectors query the whole response rather than
        # the current review node, and the same module-level dict object is
        # appended on every iteration.
        customerReviewData[reviewTitle] = response.css('#cm_cr-review_list .review-title span ::text').extract()
        customerReviewData[reviewDescription] = response.css('#cm_cr-review_list .review-text span::text').extract()
        customerReviewDataList.append(customerReviewData)
    reviewNextPageRelativeUrl = response.css('#cm_cr-pagination_bar .a-pagination .a-last a::attr(href)')[0].extract()
    if reviewNextPageRelativeUrl:
        reviewNextPageAbsoluteUrl = response.urljoin(reviewNextPageRelativeUrl)
        # NOTE(review): the parentheses call this function immediately
        # instead of passing it as the callback.
        yield Request(url=reviewNextPageAbsoluteUrl, callback=getCrrFromReviewPage())
class UsAmazonSpider(scrapy.Spider):
    """Spider for one product page; emits the title and the review helper's result."""

    name = 'Test_Crawler'
    allowed_domains = ['amazon.in']
    start_urls = ['https://www.amazon.in/Philips-Trimmer-Cordless-Corded-QT4011/dp/B00JJIDBIC/ref=sr_1_3?keywords=philips&qid=1554266853&s=gateway&sr=8-3']

    def parse(self, response):
        """Yield a dict with ``Title_H1`` and ``customer_Review_Data``."""
        # The title is deliberately kept as a one-element tuple here: that is
        # what the original trailing comma produced.
        title_h1 = (getProductTitleH1(response),)
        # NOTE(review): this stores the generator object returned by the
        # helper, not a list of reviews.
        review_data = getCustomerRatingsAndComments(response)
        result = {
            'Title_H1': title_h1,
            'customer_Review_Data': review_data,
        }
        yield result
I'm getting the following response:
{'Title_H1': (['Philips Beard Trimmer Cordless and Corded for Men QT4011/15'],), 'customer_Review_Data': <generator object getCustomerRatingsAndComments at 0x048AC630>}
The "customer_Review_Data" should be a list of dicts, each holding a title and a review.
I am not able to figure out what mistake I am making here.
When I use log() or print() to see what data is captured in customerReviewDataList[], I am unable to see the data in the console either.
I am able to scrape all the reviews into customerReviewDataList[] if they are present on the product page.
In this scenario, where I have to use yield, I am getting the output stated above, like this [https://ibb.co/kq8w6cf]
This is the kind of output I am looking for:
[{'customerReviewTitle': ['Difficult to find a charger adapter'], 'customerReviewComment': ['I already have a phillips trimmer which was only cordless. ']}, {'customerReviewTitle': ['Good Product'], 'customerReviewComment': ['Solves my need perfectly HK']}]
Any help is appreciated. Thanks in advance.
You should complete the Scrapy tutorial. The "Following links" section should be especially helpful to you.
This is a simplified version of your code:
def data_request_iterator():
    """Generator producing a single request for https://example.org."""
    request = Request('https://example.org')
    yield request
class MySpider(Spider):
    """Minimal spider whose item embeds a generator under ``data``."""

    name = 'myspider'
    start_urls = ['https://example.com']

    def parse(self, response):
        """Yield a dict holding the page title and a request generator."""
        page_title = response.css('title::text').get()
        # ``data`` receives the generator object itself; the request inside
        # it is not yielded from this callback.
        pending = data_request_iterator()
        yield {'title': page_title, 'data': pending}
Instead, it should look like this:
class MySpider(Spider):
    """Spider that carries a partial item to a second callback via ``meta``."""

    name = 'myspider'
    start_urls = ['https://example.com']

    def parse(self, response):
        """Start the item with the page title and request the second page."""
        page_title = response.css('title::text').get()
        item = dict(title=page_title)
        # The partial item rides along in ``meta`` and is read back in
        # ``parse_data``.
        yield Request(
            'https://example.org',
            callback=self.parse_data,
            meta={'item': item},
        )

    def parse_data(self, response):
        """Retrieve the item stored in ``response.meta`` and yield it."""
        item = response.meta['item']
        # TODO: Extend item with data from this second response as needed.
        yield item

Is it possible to pass a variable from start_requests() to parse() for each individual request?

I'm using a loop to generate my requests inside start_requests() and I'd like to pass the index to parse() so it can store it in the item. However, when I use self.i, every item ends up with the maximum value of i (the value from the last loop iteration). I could extract the index from response.url with a regex, but I wonder if there is a clean way to pass a variable from start_requests() to parse().
You can use scrapy.Request meta attribute:
import scrapy
class MySpider(scrapy.Spider):
    """Spider that tags each start request with its position via ``meta``."""

    name = 'myspider'

    def start_requests(self):
        """Yield one request per URL, storing its index under ``meta['index']``."""
        urls = [...]
        for position, target in enumerate(urls):
            request = scrapy.Request(target, meta={'index': position})
            yield request

    def parse(self, response):
        """Print the response URL and the index stored on its request."""
        print(response.url)
        print(response.meta['index'])
You can pass cb_kwargs argument to scrapy.Request()
https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.cb_kwargs
import scrapy
class MySpider(scrapy.Spider):
    """Spider that hands each request's position to ``parse`` via ``cb_kwargs``."""

    name = 'myspider'

    def start_requests(self):
        """Yield one request per URL with ``index`` as a callback keyword argument."""
        urls = [...]
        for position, target in enumerate(urls):
            extra = {'index': position}
            yield scrapy.Request(target, callback=self.parse, cb_kwargs=extra)

    def parse(self, response, index):
        """Callback receiving ``index`` as a parameter; intentionally empty."""

Looking for a better way to handle all items from one url

I have one spider to crawl a list of urls, like
class MySpider(scrapy.Spider):
    """Spider that collects the feed entries of each start URL into a list."""

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
        'http://www.example.com/3.html',
    ]

    def parse(self, response):
        """Return one ``MyItem`` per ``<li>`` under the ``feed-main-list`` element."""
        items = []
        # XPath attribute tests are written ``@attr``; ``#attr`` is not valid XPath.
        records = response.xpath('//*[@id="feed-main-list"]/li')
        for rec in records:
            item = MyItem()
            item['spiderUrl'] = response.request.url
            item['url'] = rec.xpath('.//*[@class="feed-block-title"]/a/@href').extract_first().strip()
            item['title'] = rec.xpath('string(.//*[@class="feed-block-title"])').extract_first().strip()
            item['lastUpdate'] = 'success'
            items.append(item)
        return items
For each URL, I need to handle all of its items together (analyze the data, send an email if something happens) and as soon as possible. I chose a pipeline to do it, but a pipeline just receives the items one by one from the spider.
So I tried to pack the items into one container item in the spider.
In spider,
container = ContainerItem()
container['url'] = response.request.url
container['itemist'] = items
return [container]
and in pipeline,
def process_item(self, item, spider):
    # The container item carries every record of one URL under ``itemList``.
    item['itemList']
    n = len(item['itemList'])
    for i in item['itemList']:
        # Each entry is converted to a plain dict before further handling.
        item = dict(i)
        ...
So, my questions are:
1. Is it a good way to implement it according to my requirement?
2. Packing a list of items into one container item like that seems very ugly. Is there any Scrapy-style method to do it?
Thanks!
I think the most logical solution would be to combine all items into one. Nesting in a dictionary is a very common concept and it might appear complicated and dirty but really is optimal and easy as long as you don't go 10 levels deep.
To do that simply wrap your items list in a dictionary, like:
def parse(self, response):
    """Return every feed entry of the page wrapped in a single dict.

    Returns:
        ``{'items': [...]}`` with one ``MyItem`` per ``<li>`` under the
        ``feed-main-list`` element, so all records of one URL travel
        together as a single item.
    """
    items = []
    # XPath attribute tests are written ``@attr``; ``#attr`` is not valid XPath.
    records = response.xpath('//*[@id="feed-main-list"]/li')
    for rec in records:
        item = MyItem()
        item['spiderUrl'] = response.request.url
        item['url'] = rec.xpath('.//*[@class="feed-block-title"]/a/@href').extract_first().strip()
        item['title'] = rec.xpath('string(.//*[@class="feed-block-title"])').extract_first().strip()
        item['lastUpdate'] = 'success'
        items.append(item)
    return {'items': items}
Now your pipeline will receive all of the items as one item, that you can unpack, sort and do whatever you wish.
In Scrapy this approach is very common, and it is even used with ItemLoader when loaders are used instead of a plain scrapy.Item, which, to clarify, is just a slightly modified Python dictionary!

crawling inside a for loop is not synchronous

Source Code
for hotel in response.xpath('//div[contains(#class,"sr_item")]'):
hotelName = hotel.xpath('.//span[contains(#class,"sr-hotel__name")]//text()')
print hotelName.extract()
hotel_image = hotel.xpath('.//img[contains(#class, "hotel_image")]//#src')
print hotel_image.extract()
hotelLink = hotel.xpath('.//a[contains(#class,"hotel_name_link")]//#href')
yield scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)
next_page = response.xpath('//a[contains(#class,"paging-next")]//#href')
My code can be seen in the attached image. As you can see, inside the for loop I want Scrapy to return from the function "parseHotel" and then continue executing the for loop.
However, right now it first prints all the hotel names, meaning the for loop gets executed completely, and only then does "parseHotel" start yielding.
This would mess up my output once I start assigning values to the item object.
Almost definitely what you're trying to do is the "Passing additional data to callback functions" from the Scrapy documentation. Here's how it would look for your case:
def parse_item(self, response):
    """Yield a detail-page request per hotel, then one for the next listing page.

    Each hotel's partially filled ``HotelItem`` is attached to its request
    under ``request.meta['item']`` so that ``parseHotel`` can complete and
    yield it. A hotel without a detail link is yielded as-is.
    """
    # XPath attribute tests are written ``@attr``; ``#attr`` is not valid XPath.
    for hotel in response.xpath('//div[contains(@class,"sr_item")]'):
        item = HotelItem()

        # Store the extracted strings, not the selector objects.
        hotel_name = hotel.xpath('.//span[contains(@class,"sr-hotel__name")]//text()').extract()
        print(hotel_name)
        item["hotelName"] = hotel_name

        hotel_image = hotel.xpath('.//img[contains(@class, "hotel_image")]//@src').extract()
        print(hotel_image)
        item["hotel_image"] = hotel_image

        hotel_link = hotel.xpath('.//a[contains(@class,"hotel_name_link")]//@href').extract_first()
        if hotel_link is None:
            # No detail page to visit: emit what was collected instead of
            # failing on an empty selector list.
            yield item
            continue

        request = scrapy.Request(response.urljoin(hotel_link), self.parseHotel)
        request.meta['item'] = item
        yield request

    # ``urljoin`` needs a single string, so take the first match only and
    # stop paginating when there is no "next" link.
    next_page = response.xpath('//a[contains(@class,"paging-next")]//@href').extract_first()
    if next_page is not None:
        yield scrapy.Request(response.urljoin(next_page), self.parse_item)
def parseHotel(self, response):
    """Fill the extra fields of the item carried in ``response.meta`` and yield it."""
    hotel_item = response.meta['item']
    extra_queries = (
        ("extra_1", '/example/text()'),
        ("extra_2", '/example2/text()'),
    )
    for field, query in extra_queries:
        hotel_item[field] = response.xpath(query).extract_first()
    yield hotel_item