How to check if url from xpath exists? - scrapy

I have two functions in Scrapy
def parse_attr(self, response):
    for resource in response.xpath(''):
        item = Item()
        item['Name'] = response.xpath('').extract()
        item['Title'] = response.xpath('').extract()
        item['Contact'] = response.xpath('').extract()
        item['Gold'] = response.xpath('').extract()
        company_page = response.urljoin(resource.xpath('/div/@href').extract_first())
        if company_page:
            request = scrapy.Request(company_page, callback=self.company_data)
            request.meta['item'] = item
            yield request
        else:
            yield item

def company_data(self, response):
    item = response.meta['item']
    item['Products'] = response.xpath('').extract()
    yield item
parse_attr calls company_data when it extracts an href from the page and passes it along as company_page; however, this href does not always exist. How can I check whether the href exists and, if not, stop Scrapy from moving to the other function?
The above code does not satisfy this condition because company_page is always truthy.
What I want is for Scrapy to stop if there is no href and finish the job with just the items it already has. If an href is found, then I want Scrapy to move to the other function and extract the additional item.

response.urljoin() will always return something (the request's base URL), even if the argument is empty. Therefore your variable will always contain a value and consequently evaluate as True.
You need to do the URL joining inside your conditional. For example:
company_page = resource.xpath('/div/@href').extract_first()
if company_page:
    company_page = response.urljoin(company_page)
    request = scrapy.Request(company_page, callback=self.company_data)
    request.meta['item'] = item
    yield request
else:
    yield item
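To see why the original check always passed, here is a minimal illustration using urllib directly (Scrapy's response.urljoin() is a thin wrapper around it); the base URL below is made up for the example:
from urllib.parse import urljoin

base = 'https://example.com/companies/list'  # hypothetical base URL

print(urljoin(base, ''))        # 'https://example.com/companies/list' -- falls back to the base
print(urljoin(base, None))      # also returns the base, since a falsy url argument is ignored
print(bool(urljoin(base, '')))  # True, so `if company_page:` always passed in the original code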

Related

scrapy doesn't stop after yield in python

I'm trying to make a spider that goes through a certain number of start URLs, and if the resulting page is the right one I yield another request. The problem is that if I try any way of not yielding a second request, the spider stops immediately. There are no problems if I yield the second request.
Here is the relevant code:
def start_requests(self):
    urls = ['https://www.hltv.org' + player for player in self.hashPlayers]
    print(len(urls))
    for url in urls:
        return [scrapy.Request(url=url, callback=self.parse)]

def parse(self, response):
    result = response.xpath("//div[@class = 'playerTeam']//a/@href").get()
    if result is None:
        result = response.xpath("//span[contains(concat(' ',normalize-space(@class),' '),' profile-player-stat-value bold ')]//a/@href").get()
    if result is not None:
        yield scrapy.Request(
            url = "https://www.hltv.org" + result,
            callback = self.parseTeam
        )
So I want a way to make the spider continue after I call the parse function and don't yield a request.

How do I connect items from one parse method to another?

import scrapy
from ..items import GooddealItem

class FarmtoolsSpider(scrapy.Spider):
    name = 'farmtools'
    allowed_domains = ['www.gooddeal.com']
    start_urls = ['https://www.gooddeal.com/all?source=private&sort=publishdate%20desc']

    def parse(self, response):
        items = GooddealItem()
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            link = row.xpath('.//a/@href').get()  # this is the full link.
            link_split = link.split('/')[-1]  # this splits the url link the first time.
            linkid = link_split.split('?')[0]  # this splits it the second time.
            title = row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get()
            county = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
            price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
            zero = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
            if zero == '0 min':
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                items['zero'] = zero
                items['link'] = link
                yield response.follow(url = link, callback=self.parse_item_page)

    def parse_item_page(self, response):
        items = GooddealItem()
        rows = response.xpath('/html/body[1]')
        for row in rows:
            category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
            views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
            seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/@href').get(),
            seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
            lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()
            items['category'] = category
            items['views'] = views
            items['seller_id'] = seller_id
            items['seller_ads'] = seller_ads
            items['lifetime_ads'] = lifetime_ads
            yield items
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks
I'm assuming you want the data scraped in the parse method to be joined with the data scraped in parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter receives a dict with arbitrary data that will be used as argument in the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter. Like this:
def parse_item_page(self, response, scraped_item):
...
Scrapy will take care of passing scraped_item when calling parse_item_page.
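For completeness, here is a minimal sketch of how the two callbacks could be wired up end to end with cb_kwargs (the detail-page XPath is a placeholder and the field names just follow the question):
def parse(self, response):
    rows = response.xpath('//ul[@class="card-collection"]/li')
    for row in rows:
        items = GooddealItem()
        items['link'] = row.xpath('.//a/@href').get()
        # ... fill in the other listing-page fields as in the question ...
        yield response.follow(url=items['link'],
                              callback=self.parse_item_page,
                              cb_kwargs={'scraped_item': items})

def parse_item_page(self, response, scraped_item):
    # scraped_item already holds the listing-page fields; add the
    # detail-page fields and yield the single combined item.
    scraped_item['views'] = response.xpath('//span[@class="view-count"]/text()').get()  # placeholder XPath
    yield scraped_item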
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This still works in more recent versions, but cb_kwargs (the solution above) is preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
items = response.meta.get('scraped_item') #response.meta is a dict
...

Looking for a better way to handle all items from one url

I have one spider to crawl a list of URLs, like:
class MySpider(scrapy.Spider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
        'http://www.example.com/3.html',
    ]

    def parse(self, response):
        items = []
        records = response.xpath('//*[@id="feed-main-list"]/li')
        for rec in records:
            item = MyItem()
            item['spiderUrl'] = response.request.url
            item['url'] = rec.xpath('.//*[@class="feed-block-title"]/a/@href').extract_first().strip()
            item['title'] = rec.xpath('string(.//*[@class="feed-block-title"])').extract_first().strip()
            item['lastUpdate'] = 'success'
            items.append(item)
        return items
For each URL, I need to handle its items together (analyze the data, send an email if something happens), and as soon as possible. I chose a pipeline to do it, but the pipeline just receives the items one by one.
So I try to pack the items into one container item in the spider.
In the spider:
container = ContainerItem()
container['url'] = response.request.url
container['itemList'] = items
return [container]
and in the pipeline:
def process_item(self, item, spider):
    item['itemList']
    n = len(item['itemList'])
    for i in item['itemList']:
        item = dict(i)
        ...
So, my questions are:
1. Is this a good way to implement my requirement?
2. Packing a list of items into one container item like that seems very ugly. Is there any Scrapy-style method to do it?
Thanks!
I think the most logical solution would be to combine all the items into one. Nesting items in a dictionary is a very common pattern; it might look complicated and messy, but it is practical and easy to work with as long as you don't go ten levels deep.
To do that simply wrap your items list in a dictionary, like:
def parse(self, response):
    items = []
    records = response.xpath('//*[@id="feed-main-list"]/li')
    for rec in records:
        item = MyItem()
        item['spiderUrl'] = response.request.url
        item['url'] = rec.xpath('.//*[@class="feed-block-title"]/a/@href').extract_first().strip()
        item['title'] = rec.xpath('string(.//*[@class="feed-block-title"])').extract_first().strip()
        item['lastUpdate'] = 'success'
        items.append(item)
    return {'items': items}
Now your pipeline will receive all of the items as one item, which you can unpack, sort, and process however you wish.
In Scrapy this approach is very common and is even used with ItemLoader when those are used instead of plain scrapy.Item, which, to clarify, is just a slightly modified Python dictionary!
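On the pipeline side, a minimal sketch of unpacking that container could look like the following (the class name is arbitrary and the logging is a stand-in for whatever per-URL analysis or emailing you need):
class PerUrlPipeline:
    def process_item(self, item, spider):
        # `item` is the {'items': [...]} dict returned by parse().
        page_items = item.get('items', [])
        if page_items:
            url = page_items[0]['spiderUrl']
            # Analyze all items from this URL together; placeholder for the
            # real "analyze data / send email" logic.
            spider.logger.info('Got %d items from %s', len(page_items), url)
        return item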

crawling inside a for loop is not synchronous

Source Code
for hotel in response.xpath('//div[contains(@class,"sr_item")]'):
    hotelName = hotel.xpath('.//span[contains(@class,"sr-hotel__name")]//text()')
    print hotelName.extract()
    hotel_image = hotel.xpath('.//img[contains(@class, "hotel_image")]//@src')
    print hotel_image.extract()
    hotelLink = hotel.xpath('.//a[contains(@class,"hotel_name_link")]//@href')
    yield scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)

next_page = response.xpath('//a[contains(@class,"paging-next")]//@href')
My code is shown above. As you can see, inside the for loop I want Scrapy to return from the function "parseHotel" and then continue executing the for loop.
However, right now it first prints all the hotel names, meaning the for loop gets executed completely before "parseHotel" starts yielding.
This would mess up my output once I start assigning values to the item object.
Almost certainly what you're trying to do is "Passing additional data to callback functions" from the Scrapy documentation. Here's how it would look for your case:
def parse_item(self, response):
    for hotel in response.xpath('//div[contains(@class,"sr_item")]'):
        item = HotelItem()
        hotelName = hotel.xpath('.//span[contains(@class,"sr-hotel__name")]//text()')
        print hotelName.extract()
        item["hotelName"] = hotelName
        hotel_image = hotel.xpath('.//img[contains(@class, "hotel_image")]//@src')
        print hotel_image.extract()
        item["hotel_image"] = hotel_image
        hotelLink = hotel.xpath('.//a[contains(@class,"hotel_name_link")]//@href')
        request = scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)
        request.meta['item'] = item
        yield request
    next_page = response.xpath('//a[contains(@class,"paging-next")]//@href')
    yield scrapy.Request(response.urljoin(next_page.extract_first()), self.parse_item)

def parseHotel(self, response):
    item = response.meta['item']
    item["extra_1"] = response.xpath('/example/text()').extract_first()
    item["extra_2"] = response.xpath('/example2/text()').extract_first()
    yield item
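As a side note, and consistent with the cb_kwargs answer earlier on this page, on Scrapy 1.7+ the same hand-off could be sketched without request.meta (abbreviated to the relevant lines; the fields are the same as above):
def parse_item(self, response):
    for hotel in response.xpath('//div[contains(@class,"sr_item")]'):
        item = HotelItem()
        item["hotelName"] = hotel.xpath('.//span[contains(@class,"sr-hotel__name")]//text()')
        hotelLink = hotel.xpath('.//a[contains(@class,"hotel_name_link")]//@href')
        yield scrapy.Request(response.urljoin(hotelLink[0].extract()),
                             callback=self.parseHotel,
                             cb_kwargs={'item': item})

def parseHotel(self, response, item):
    # `item` arrives directly as a keyword argument instead of via response.meta.
    item["extra_1"] = response.xpath('/example/text()').extract_first()
    yield item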

Sequentially crawl website using scrapy

Is there a way to tell Scrapy to stop crawling based on a condition in the 2nd-level page? I am doing the following:
I have a start_url to begin with (1st level page)
I have a set of urls extracted from the start_url using parse(self, response)
Then I queue the links using Request with the callback parseDetailPage(self, response)
In parseDetailPage (2nd level page) I find out whether I should stop crawling or not
Right now I am using CloseSpider() to accomplish this, but the problem is that the urls to be parsed are already queued by the time I start crawling the second-level pages, and I do not know how to remove them from the queue. Is there a way to sequentially crawl the list of links and then be able to stop in parseDetailPage?
global job_in_range
start_urls = []
start_urls.append("http://sfbay.craigslist.org/sof/")

def __init__(self):
    self.job_in_range = True

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    results = hxs.select('//blockquote[@id="toc_rows"]')
    items = []
    if results:
        links = results.select('.//p[@class="row"]/a/@href')
        for link in links:
            if link is self.end_url:
                break
            nextUrl = link.extract()
            isValid = WPUtil.validateUrl(nextUrl)
            if isValid:
                item = WoodPeckerItem()
                item['url'] = nextUrl
                item = Request(nextUrl, meta={'item': item}, callback=self.parseDetailPage)
                items.append(item)
    else:
        self.error.log('Could not parse the document')
    return items

def parseDetailPage(self, response):
    if self.job_in_range is False:
        raise CloseSpider('End date reached - No more crawling for ' + self.name)
    hxs = HtmlXPathSelector(response)
    print response
    body = hxs.select('//article[@id="pagecontainer"]/section[@class="body"]')
    item = response.meta['item']
    item['postDate'] = body.select('.//section[@class="userbody"]/div[@class="postinginfos"]/p')[1].select('.//date/text()')[0].extract()
    if item['jobTitle'] is 'Admin':
        self.job_in_range = False
        raise CloseSpider('Stop crawling')
    item['jobTitle'] = body.select('.//h2[@class="postingtitle"]/text()')[0].extract()
    item['description'] = body.select(str('.//section[@class="userbody"]/section[@id="postingbody"]')).extract()
    return item
Do you mean that you would like to stop the spider and later resume it without re-parsing the urls that have already been parsed?
If so, you may try setting JOBDIR. This setting keeps the request queue in a specified directory on disk, so a paused crawl can pick up where it left off.
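For example, the job directory can be passed per run from the command line (the spider name and path here are placeholders); running the same command again resumes the crawl from where it stopped:
scrapy crawl woodpecker -s JOBDIR=crawls/woodpecker-1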