crawling inside a for loop is not synchronous

crawling inside a for loop is not synchronous - scrapy

Source Code
for hotel in response.xpath('//div[contains(#class,"sr_item")]'):
hotelName = hotel.xpath('.//span[contains(#class,"sr-hotel__name")]//text()')
print hotelName.extract()
hotel_image = hotel.xpath('.//img[contains(#class, "hotel_image")]//#src')
print hotel_image.extract()
hotelLink = hotel.xpath('.//a[contains(#class,"hotel_name_link")]//#href')
yield scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)
next_page = response.xpath('//a[contains(#class,"paging-next")]//#href')
My code can be seen attached as an image. As, you can see, inside the for loop. I want Scrapy to return from the function "hotelParse", then continue, executing the for loop.
However, now, it firsts prints all the hotel names, meaning, the for loop get executed completely, then "hotelParse" starts yielding.
This would mess up my output, once, I start assigning values to the item object.

Almost definitely what you're trying to do is the "Passing additional data to callback functions" from the Scrapy documentation. Here's how it would look for your case:
def parse_item(self, response):
for hotel in response.xpath('//div[contains(#class,"sr_item")]'):
item = HotelItem()
hotelName = hotel.xpath('.//span[contains(#class,"sr-hotel__name")]//text()')
print hotelName.extract()
item["hotelName"] = hotelName
hotel_image = hotel.xpath('.//img[contains(#class, "hotel_image")]//#src')
print hotel_image.extract()
item["hotel_image"] = hotel_image
hotelLink = hotel.xpath('.//a[contains(#class,"hotel_name_link")]//#href')
request = scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)
request.meta['item'] = item
yield request
next_page = response.xpath('//a[contains(#class,"paging-next")]//#href')
yield scrapy.Request(response.urljoin(next_page.extract()), self.parse_item)
def parseHotel(self, response):
item = response.meta['item']
item["extra_1"] = response.xpath('/example/text()').extract_first()
item["extra_2"] = response.xpath('/example2/text()').extract_first()
yield item

Related

How do I connect items from one parse method to another?

'''
import scrapy
from ..items import GooddealItem
class FarmtoolsSpider(scrapy.Spider):
name = 'farmtools'
allowed_domains = ['www.gooddeal.com']
start_urls = ['https://www.gooddeal.com/all?
source=private&sort=publishdate%20desc']
def parse(self, response):
items = GooddealItem()
rows = response.xpath('//ul[#class="card-collection"]/li')
for row in rows:
link = row.xpath('.//a/#href').get() #this is the full link.
link_split = link.split('/')[-1] #this splits the url link th first time.
linkid = link_split.split('?')[0] #this splits it the second time.
title = row.xpath('.//div[1]/p[#class="card__body-title"]/text()').get()
county = row.xpath('.//a/div/div[2]/div[1]/ul[#class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
price = row.xpath('.//p[#class="card__price"]/span[1]/text()').get()
subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
zero = row.xpath('.//a/div/div[2]/div[1]/ul[#class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
if zero == '0 min':
items['linkid'] = linkid
items['title'] = title
items['county'] = county
items['price'] = price
items['subcat'] = subcat
items['zero'] = zero
items['link'] = link
yield response.follow(url = link, callback=self.parse_item_page)
def parse_item_page(self, response):
items = GooddealItem()
rows = response.xpath('/html/body[1]')
for row in rows:
category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/#href').get(),
seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()
items['category'] = category
items['views'] = views
items['seller_id'] = seller_id
items['seller_ads'] = seller_ads
items['lifetime_ads'] = lifetime_ads
yield items
'''
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks

I'm assuming you want the data scraped in parse method to be joined together with the data scraped in the parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter receives a dict with arbitrary data that will be used as argument in the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter. Like this:
def parse_item_page(self, response, scraped_item):
...
Scrapy will take care of sending the scraped_item when calling the parse_item_page.
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This method still works in more recent versions, but cb_kwargs(solution above) are preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
items = response.meta.get('scraped_item') #response.meta is a dict
...

How to use the yield function to scrape data from multiple pages

I'm trying to scrape data from amazon India website. I am not able collect response and parse the elements using the yield() method when:
1) I have to move from product page to review page
2) I have to move from one review page to another review page
Product page
Review page
Code flow:
1) customerReviewData() calls the getCustomerRatingsAndComments(response)
2) The getCustomerRatingsAndComments(response)
finds the URL of the review page and call the yield request method with getCrrFromReviewPage(request) as callback method, with url of this review page
3) getCrrFromReviewPage() gets new response of the firstreview page and scrape all the elements from the first review page (page loaded) and add it to customerReviewDataList[]
4) get URL of the next page if it exists and recursively call getCrrFromReviewPage() method, and crawl elements from next page, until all the review page is crawled
5) All the reviews gets added to the customerReviewDataList[]
I have tried playing around with yield() changing the parameters and also looked up the scrapy documentation for yield() and Request/Response yield
# -*- coding: utf-8 -*-
import scrapy
import logging
customerReviewDataList = []
customerReviewData = {}
#Get product name in <H1>
def getProductTitleH1(response):
titleH1 = response.xpath('normalize-space(//*[#id="productTitle"]/text())').extract()
return titleH1
def getCustomerRatingsAndComments(response):
#Fetches the relative url
reviewRelativePageUrl = response.css('#reviews-medley-footer a::attr(href)').extract()[0]
if reviewRelativePageUrl:
#get absolute URL
reviewPageAbsoluteUrl = response.urljoin(reviewRelativePageUrl)
yield Request(url = reviewPageAbsoluteUrl, callback = getCrrFromReviewPage())
self.log("yield request complete")
return len(customerReviewDataList)
def getCrrFromReviewPage():
userReviewsAndRatings = response.xpath('//div[#id="cm_cr-review_list"]/div[#data-hook="review"]')
for userReviewAndRating in userReviewsAndRatings:
customerReviewData[reviewTitle] = response.css('#cm_cr-review_list .review-title span ::text').extract()
customerReviewData[reviewDescription] = response.css('#cm_cr-review_list .review-text span::text').extract()
customerReviewDataList.append(customerReviewData)
reviewNextPageRelativeUrl = response.css('#cm_cr-pagination_bar .a-pagination .a-last a::attr(href)')[0].extract()
if reviewNextPageRelativeUrl:
reviewNextPageAbsoluteUrl = response.urljoin(reviewNextPageRelativeUrl)
yield Request(url = reviewNextPageAbsoluteUrl, callback = getCrrFromReviewPage())
class UsAmazonSpider(scrapy.Spider):
name = 'Test_Crawler'
allowed_domains = ['amazon.in']
start_urls = ['https://www.amazon.in/Philips-Trimmer-Cordless-Corded-QT4011/dp/B00JJIDBIC/ref=sr_1_3?keywords=philips&qid=1554266853&s=gateway&sr=8-3']
def parse(self, response):
titleH1 = getProductTitleH1(response),
customerReviewData = getCustomerRatingsAndComments(response)
yield{
'Title_H1' : titleH1,
'customer_Review_Data' : customerReviewData
}
I'm getting the following response:
{'Title_H1': (['Philips Beard Trimmer Cordless and Corded for Men QT4011/15'],), 'customer_Review_Data': <generator object getCustomerRatingsAndComments at 0x048AC630>}
The "Customer_review_Data" should be a list of dict of title and review
I am not able to figure out as to what mistake I am doing here.
When I use the log() or print() to see what data is captured in customerReviewDataList[], unable to see the data in the console either.
I am able to scrape all the reviews in customerReviewDataList[], if they are present in the product page,
In this scenario where I have to use the yield function I am getting the output stated above like this [https://ibb.co/kq8w6cf]
This is the kind of output I am looking for:
{'customerReviewTitle': ['Difficult to find a charger adapter'],'customerReviewComment': ['I already have a phillips trimmer which was only cordless. ], 'customerReviewTitle': ['Good Product'],'customerReviewComment': ['Solves my need perfectly HK']}]}
Any help is appreciated. Thanks in advance.

You should complete the Scrapy tutorial. The Following links section should be specially helpful to you.
This is a simplified version of your code:
def data_request_iterator():
yield Request('https://example.org')
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
yield {
'title': response.css('title::text').get(),
'data': data_request_iterator(),
}
Instead, it should look like this:
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
item = {
'title': response.css('title::text').get(),
}
yield Request('https://example.org', meta={'item': item}, callback=self.parse_data)
def parse_data(self, response):
item = response.meta['item']
# TODO: Extend item with data from this second response as needed.
yield item

How to check if url from xpath exists?

I have two functions in Scrapy
def parse_attr(self, response):
for resource in response.xpath(''):
item = Item()
item['Name'] = response.xpath('').extract()
item['Title'] = response.xpath('').extract()
item['Contact'] = response.xpath('').extract()
item['Gold'] = response.xpath('').extract()
company_page = response.urljoin(resource.xpath('/div/#href').extract_first())
if company_page:
request = scrapy.Request(company_page, callback = self.company_data)
request.meta['item'] = item
yield request
else:
yield item
def company_data(self, response):
item = response.meta['item']
item['Products'] = response.xpath('').extract()
yield item
parse_attr calls company_data when it extracts #href from page and it passes it to company_page, however, this href does not always exists. How can i check if href exists, and if not, stop scrapy from moving to other function?
Above code does not satisfy this condition because company_page is always true.
What I want is scrapy to stop if there is no href, and finish its job just with items it already has. If href is found, then I want scrapy to move to other function and extract additional item.

response.urljoin() will always return something (the request's base URL), even if the argument is empty. Therefore your variable will always contain a value and consequently evaluate as True.
You need to do the URL joining inside your conditional. For example:
company_page = resource.xpath('/div/#href').extract_first()
if company_page:
company_page = response.urljoin(company_page)
request = scrapy.Request(company_page, callback = self.company_data)
request.meta['item'] = item
yield request
else:
yield item

Looking for a better way to handle all items from one url

I have one spider to crawl a list of urls, like
class MySpider(scrapy.Spider):
name = 'example.com'
allowed_domains = ['example.com']
start_urls = [
'http://www.example.com/1.html',
'http://www.example.com/2.html',
'http://www.example.com/3.html',
]
def parse(self, response):
items = []
records = response.xpath('//*[#id="feed-main-list"]/li')
for rec in records:
item = MyItem()
item['spiderUrl'] = response.request.url
item['url'] = rec.xpath('.//*[#class="feed-block-title"]/a/#href').extract_first().strip()
item['title'] = rec.xpath('string(.//*[#class="feed-block-title"])').extract_first().strip()
item['lastUpdate'] = 'success'
items.append(item)
return items
For each url, i need to handle the items together(analyze data, send email if something happens) and as soon as possible. I choose the pipeline to do it. But in pipeline, it just receives item one by one from from.
So, i try to pack the items into one container item in spider.
In spider,
container = ContainerItem()
container['url'] = response.request.url
container['itemist'] = items
return [container]
and in pipeline,
> def process_item(self, item, spider):
> item['itemList']
> n = len(item['itemList'])
> for i in item['itemList']:
> item = dict(i)
> ...
So, my questions are:
1. Is it a good way to implement it according to my requirement?
2. Packing a list of items into one container item like that seems very ugly. Is there any Scrapy-style method to do it?
Thanks!

I think the most logical solution would be to combine all items into one. Nesting in a dictionary is a very common concept and it might appear complicated and dirty but really is optimal and easy as long as you don't go 10 levels deep.
To do that simply wrap your items list in a dictionary, like:
def parse(self, response):
items = []
records = response.xpath('//*[#id="feed-main-list"]/li')
for rec in records:
item = MyItem()
item['spiderUrl'] = response.request.url
item['url'] = rec.xpath('.//*[#class="feed-block-title"]/a/#href').extract_first().strip()
item['title'] = rec.xpath('string(.//*[#class="feed-block-title"])').extract_first().strip()
item['lastUpdate'] = 'success'
items.append(item)
return {'items': items}
Now your pipeline will receive all of the items as one item, that you can unpack, sort and do whatever you wish.
In scrapy this approach is very common and even used with ItemLoader if those are used instead of pure scrapy.Item, which to clarify is just slightly modified python dictionary!

Sequentially crawl website using scrapy

Is there a way to tell scrapy to stop crawling based upon condition in the 2nd level page? I am doing the following:
I have a start_url to begin with (1st level page)
I have set of urls extracted from the start_url using parse(self,
response)
Then I add queue the links using Request with callback as parseDetailPage(self, response)
Under parseDetail (2nd level page) I come to know if I can stop crawling or not
Right now I am using CloseSpider() to accomplish this, but the problem is that the urls to be parsed are already queued by the time I start crawling second level pages and I do not know how to remove them from the queue. Is there a way to sequentially crawl the list of links and then be able to stop in parseDetailPage?
global job_in_range
start_urls = []
start_urls.append("http://sfbay.craigslist.org/sof/")
def __init__(self):
self.job_in_range = True
def parse(self, response):
hxs = HtmlXPathSelector(response)
results = hxs.select('//blockquote[#id="toc_rows"]')
items = []
if results:
links = results.select('.//p[#class="row"]/a/#href')
for link in links:
if link is self.end_url:
break;
nextUrl = link.extract()
isValid = WPUtil.validateUrl(nextUrl);
if isValid:
item = WoodPeckerItem()
item['url'] = nextUrl
item = Request(nextUrl, meta={'item':item},callback=self.parseDetailPage)
items.append(item)
else:
self.error.log('Could not parse the document')
return items
def parseDetailPage(self, response):
if self.job_in_range is False:
raise CloseSpider('End date reached - No more crawling for ' + self.name)
hxs = HtmlXPathSelector(response)
print response
body = hxs.select('//article[#id="pagecontainer"]/section[#class="body"]')
item = response.meta['item']
item['postDate'] = body.select('.//section[#class="userbody"]/div[#class="postinginfos"]/p')[1].select('.//date/text()')[0].extract()
if item['jobTitle'] is 'Admin':
self.job_in_range = False
raise CloseSpider('Stop crawling')
item['jobTitle'] = body.select('.//h2[#class="postingtitle"]/text()')[0].extract()
item['description'] = body.select(str('.//section[#class="userbody"]/section[#id="postingbody"]')).extract()
return item

Do you mean that you would like to stop the spider and resume it without parsing the urls which have been parsed?
If so, you may try to set the JOB_DIR setting. This setting can keep the request.queue in specified file on the disk.

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

crawling inside a for loop is not synchronous - scrapy

Related

How do I connect items from one parse method to another?

How to use the yield function to scrape data from multiple pages

How to check if url from xpath exists?

Looking for a better way to handle all items from one url

Sequentially crawl website using scrapy

Categories

Resources