how to pass 'meta' in a loop in scrapy

def district_parse(self, response):
    item2 = response.meta["item"]
    Data2 = json.loads(re.findall(r'[(](.*?)[)]', response.text)[0])
    detail2 = Data2['data']['list']
    for loop1 in detail2:
        item2["district_time"] = loop1["timeHuman"]
        item2["district_name"] = loop1["district_name"]
        item2["district_congest"] = loop1["index"]
        item2["district_speed"] = loop1["speed"]
        item2["district_length"] = loop1["length"]
        url = "https://www.example.com"
        yield Request(url, meta={"item2": copy.deepcopy(item2)}, callback=self.curvehistory_parse)
'detail2' is a list containing multiple dictionaries. I want to pass each of these dictionaries to 'meta' in turn, but only the first dictionary is ever passed. How can I fix this?

Create a new method after district_parse, e.g. parse_details, which will do the yield to the output file.
Add a callback in district_parse, e.g. callback=self.parse_details,
and pass the meta dict to the new method one request at a time; then yield your output from the parse_details method, as sketched below.
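A minimal sketch of that idea, reusing the field names from the question. parse_details, the placeholder detail URL, and dont_filter=True are illustrative additions; dont_filter only matters if every detail request goes to the same URL, in which case Scrapy's duplicate filter would otherwise drop everything after the first request:
import copy
import json
import re

from scrapy import Request

# Both methods live on the same Spider subclass as district_parse above.
def district_parse(self, response):
    item = response.meta["item"]
    data = json.loads(re.findall(r'[(](.*?)[)]', response.text)[0])
    for entry in data['data']['list']:
        item2 = copy.deepcopy(item)  # each request gets its own copy of the item
        item2["district_time"] = entry["timeHuman"]
        item2["district_name"] = entry["district_name"]
        item2["district_congest"] = entry["index"]
        item2["district_speed"] = entry["speed"]
        item2["district_length"] = entry["length"]
        yield Request("https://www.example.com",   # placeholder detail URL
                      meta={"item2": item2},
                      callback=self.parse_details,
                      dont_filter=True)            # avoid the duplicate-URL filter

def parse_details(self, response):
    item2 = response.meta["item2"]
    # ... add whatever this page contributes, then yield the finished item
    yield item2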

Related

How do I connect items from one parse method to another?

'''
import scrapy
from ..items import GooddealItem

class FarmtoolsSpider(scrapy.Spider):
    name = 'farmtools'
    allowed_domains = ['www.gooddeal.com']
    start_urls = ['https://www.gooddeal.com/all?source=private&sort=publishdate%20desc']

    def parse(self, response):
        items = GooddealItem()
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            link = row.xpath('.//a/@href').get() #this is the full link.
            link_split = link.split('/')[-1] #this splits the url link the first time.
            linkid = link_split.split('?')[0] #this splits it the second time.
            title = row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get()
            county = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
            price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
            zero = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
            if zero == '0 min':
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                items['zero'] = zero
                items['link'] = link
                yield response.follow(url = link, callback=self.parse_item_page)

    def parse_item_page(self, response):
        items = GooddealItem()
        rows = response.xpath('/html/body[1]')
        for row in rows:
            category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
            views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
            seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/@href').get(),
            seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
            lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()
            items['category'] = category
            items['views'] = views
            items['seller_id'] = seller_id
            items['seller_ads'] = seller_ads
            items['lifetime_ads'] = lifetime_ads
            yield items
'''
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks
I'm assuming you want the data scraped in the parse method to be joined with the data scraped in parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter receives a dict of arbitrary data that will be passed as keyword arguments to the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter. Like this:
def parse_item_page(self, response, scraped_item):
...
Scrapy will take care of passing scraped_item when it calls parse_item_page.
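Put together, a minimal sketch of the cb_kwargs flow, assuming the goal is one merged item per listing. The XPaths are shortened and illustrative; note that the item is created inside the loop so each request carries its own copy, and the second callback keeps filling that item rather than creating a new one:
def parse(self, response):
    for row in response.xpath('//ul[@class="card-collection"]/li'):
        items = GooddealItem()
        items['link'] = row.xpath('.//a/@href').get()
        # ... fill the other listing-page fields here ...
        yield response.follow(url=items['link'],
                              callback=self.parse_item_page,
                              cb_kwargs={'scraped_item': items})

def parse_item_page(self, response, scraped_item):
    # Keep filling the item that arrived with the request instead of creating a new one
    scraped_item['category'] = response.xpath('//nav//span/a[1]/span/text()').get()
    # ... fill the remaining detail-page fields ...
    yield scraped_item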
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This method still works in more recent versions, but cb_kwargs (the solution above) is preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
items = response.meta.get('scraped_item') #response.meta is a dict
...

I want to extract the numbers at the end of a url using regular expressions in scrapy

I want to get '25430989' from the end of this url.
https://www.example.com/cars-for-sale/2007-ford-focus-1-6-diesel/25430989
How would I write this using XPath?
I get the link using this xpath:
link = row.xpath('.//a/#href').get()
When I use a regex tester I can isolate it with r'(\d+)$', but when I put it into my code it doesn't work for some reason.
import scrapy
import re
from ..items import DonedealItem

class FarmtoolsSpider(scrapy.Spider):
    name = 'farmtools'
    allowed_domains = ['www.donedeal.ie']
    start_urls = ['https://www.donedeal.ie/all?source=private&sort=publishdate%20desc']

    def parse(self, response):
        items = DonedealItem()
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            if row.xpath('.//ul[@class="card__body-keyinfo"]/li[contains(text(),"0 min")]/text()'):
                link = row.xpath('.//a/@href').get() #this is the full link.
                linkid = link.re(r'(\d+)$).get()
                title = row.xpath('.//p[@class="card__body-title"]/text()').get()
                county = row.xpath('.//li[contains(text(),"min")]/following-sibling::node()/text()').get()
                price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
                subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
                items['link'] = link
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                yield items
I'm trying to get the linkid.
The problem is here
link = row.xpath('.//a/#href').get() #this is the full link.
linkid = link.re(r'(\d+)$).get()
The .get() method returns a string, which is what ends up in the link variable, and strings don't have a .re() method for you to call. You can use one of the functions from the re module instead (see its docs for reference).
I would use re.findall(); it returns a list of the values that match the regex (in this case only one item would be returned), or an empty list if nothing matches. re.search() is also a good choice, but it returns a re.Match object (or None if there is no match).
import re #Don't forget to import it
...
link = row.xpath('.//a/@href').get()
linkid = re.findall(r'(\d+)$', link)
Scrapy selectors also support regex directly, so an alternative (with no need for the re module) is:
linkid = row.xpath('.//a/@href').re_first(r'(\d+)$')
Notice I didn't use .get() there.
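For example, with the URL from the question (the variable name is illustrative):
import re

link = 'https://www.example.com/cars-for-sale/2007-ford-focus-1-6-diesel/25430989'

re.findall(r'(\d+)$', link)          # ['25430989'] -- a list of matching strings
re.search(r'(\d+)$', link).group(1)  # '25430989'   -- via a re.Match object
# Selector.re_first() returns the first captured group as a string, or None if nothing matches.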

How to check if url from xpath exists?

I have two functions in Scrapy
def parse_attr(self, response):
    for resource in response.xpath(''):
        item = Item()
        item['Name'] = response.xpath('').extract()
        item['Title'] = response.xpath('').extract()
        item['Contact'] = response.xpath('').extract()
        item['Gold'] = response.xpath('').extract()
        company_page = response.urljoin(resource.xpath('/div/@href').extract_first())
        if company_page:
            request = scrapy.Request(company_page, callback = self.company_data)
            request.meta['item'] = item
            yield request
        else:
            yield item

def company_data(self, response):
    item = response.meta['item']
    item['Products'] = response.xpath('').extract()
    yield item
parse_attr calls company_data when it extracts the @href from the page and passes it via company_page; however, this href does not always exist. How can I check whether the href exists and, if not, stop Scrapy from moving on to the other function?
The above code does not satisfy this condition because company_page is always truthy.
What I want is for Scrapy to stop if there is no href and finish its job with the items it already has. If an href is found, I want Scrapy to move on to the other function and extract the additional item.
response.urljoin() will always return something (the request's base URL), even if the argument is empty. Therefore your variable will always contain a value and consequently evaluate as True.
You need to do the URL joining inside your conditional. For example:
company_page = resource.xpath('/div/@href').extract_first()
if company_page:
    company_page = response.urljoin(company_page)
    request = scrapy.Request(company_page, callback = self.company_data)
    request.meta['item'] = item
    yield request
else:
    yield item
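This works because extract_first() returns None when the XPath matches nothing, so the if only succeeds when an href was actually found; otherwise the else branch yields the item as-is without scheduling the extra request.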

Loop on scrapy FormRequest but only one item created

So I've tried to loop on a FormRequest that calls the function that creates, fills and yields the item. The only problem: exactly one item is produced no matter how many times the loop runs, and I can't figure out why.
def access_data(self, res):
    # receive all IDs and request the details
    res_json = (res.body).decode("utf-8")
    res_json = json.loads(res_json)
    for a in res_json['data']:
        logging.warning(a['id'])
        req = FormRequest(
            url='https://my_url',
            cookies={my_cookies},
            method='POST',
            callback=self.fill_details,
            formdata={'valeur': str(a['id'])},
            headers={'X-Requested-With': 'XMLHttpRequest'}
        )
        yield req

def fill_details(self, res):
    logging.warning("annonce")
    item = MyItem()
    item['html'] = res.xpath('//body//text()')
    item['existe'] = True
    item['ip_proxy'] = None
    item['launch_time'] = str(mySpider.init_time)
    yield item
To be sure everything is clear:
when I run this, the log "annonce" is printed only once, while the logging of a['id'] in my request loop is printed many times, and I can't find a way to fix this.
I found the fix!
If anyone has the same problem: as my URL is always the same (only the formdata changes), Scrapy's duplicate filter takes over and drops the duplicate requests.
Set dont_filter=True in the FormRequest to make it work.
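A minimal sketch of that change against the loop above (the URL, cookies and form field are placeholders from the question):
req = FormRequest(
    url='https://my_url',
    cookies={},                       # your cookies here (placeholder)
    method='POST',
    callback=self.fill_details,
    formdata={'valeur': str(a['id'])},
    headers={'X-Requested-With': 'XMLHttpRequest'},
    dont_filter=True,                 # skip Scrapy's duplicate-request filter
)
yield req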

crawling inside a for loop is not synchronous

Source Code
for hotel in response.xpath('//div[contains(@class,"sr_item")]'):
    hotelName = hotel.xpath('.//span[contains(@class,"sr-hotel__name")]//text()')
    print hotelName.extract()
    hotel_image = hotel.xpath('.//img[contains(@class, "hotel_image")]//@src')
    print hotel_image.extract()
    hotelLink = hotel.xpath('.//a[contains(@class,"hotel_name_link")]//@href')
    yield scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)
next_page = response.xpath('//a[contains(@class,"paging-next")]//@href')
My source code is shown above. As you can see, inside the for loop I want Scrapy to return from the parseHotel callback before continuing with the loop.
However, right now it first prints all the hotel names, meaning the for loop executes completely before parseHotel starts yielding.
This messes up my output once I start assigning values to the item object.
Almost certainly what you're trying to do is "Passing additional data to callback functions" from the Scrapy documentation. Here's how it would look in your case:
def parse_item(self, response):
    for hotel in response.xpath('//div[contains(@class,"sr_item")]'):
        item = HotelItem()
        hotelName = hotel.xpath('.//span[contains(@class,"sr-hotel__name")]//text()')
        print hotelName.extract()
        item["hotelName"] = hotelName
        hotel_image = hotel.xpath('.//img[contains(@class, "hotel_image")]//@src')
        print hotel_image.extract()
        item["hotel_image"] = hotel_image
        hotelLink = hotel.xpath('.//a[contains(@class,"hotel_name_link")]//@href')
        request = scrapy.Request(response.urljoin(hotelLink[0].extract()), self.parseHotel)
        request.meta['item'] = item
        yield request
    next_page = response.xpath('//a[contains(@class,"paging-next")]//@href')
    # extract_first() gives a single URL string (extract() would return a list)
    yield scrapy.Request(response.urljoin(next_page.extract_first()), self.parse_item)

def parseHotel(self, response):
    item = response.meta['item']
    item["extra_1"] = response.xpath('/example/text()').extract_first()
    item["extra_2"] = response.xpath('/example2/text()').extract_first()
    yield item
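With this layout the combined item is only yielded once parseHotel has run for the corresponding hotel, so every item carries both the listing data and the detail-page data. Scrapy still schedules the requests asynchronously, so the order of items in the output may differ from the order of hotels on the page.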