I am quite new to Scrapy, and I'm trying to figure out how to set the structure of a CSV export. In the following example I have two kinds of scraped data: ids and links.
Here is the simple code i'm using :
class MybotSpider(scrapy.Spider):
    """Spider that collects every ``data-id`` and every clickable link.

    NOTE: ``parse`` yields a single item whose two fields hold the complete
    lists, so all ids and all links end up together in one exported record.
    """

    name = 'mybot'
    start_urls = ['url']

    def parse(self, response):
        """Yield one ``SpiderItem`` holding all scraped ids and all links."""
        all_dataids = response.css('li::attr(data-id)').extract()
        # XPath addresses attributes with "@" (e.g. @class, @href).
        all_links = response.xpath(
            '//a[contains(@class, "js_clickable")]/@href'
        ).extract()

        adlist = SpiderItem()
        adlist['dataid'] = all_dataids
        adlist['link'] = all_links
        yield adlist
But my export is like this :
Instead, I would like the export to have one row per id, with the corresponding href next to it:
Make yielding items in this way:
def parse(self, response):
    """Yield one ``SpiderItem`` per (link, data-id) pair.

    The two extracted lists are paired positionally with ``zip``. Pairing
    stops at the end of the shorter list, so surplus entries in the longer
    list are dropped.
    """
    all_dataids = response.css('li::attr(data-id)').extract()
    # XPath addresses attributes with "@" (e.g. @class, @href).
    all_links = response.xpath(
        '//a[contains(@class, "js_clickable")]/@href'
    ).extract()

    for link, dataid in zip(all_links, all_dataids):
        adlist = SpiderItem()
        adlist['dataid'] = dataid
        adlist['link'] = link
        yield adlist
Here you zip your lists into ((link, dataid), (link, dataid), (link, dataid), ...) and then yield the pairs one by one, so it should give you the desired output.
Related
I'm trying to extract some fields from start_url, and I want to add a PDF link field obtained from each of the URLs collected there. I tried Scrapy but had no luck adding the PDF field. Here is my code:
import scrapy
class MybookSpider(scrapy.Spider):
    """Spider for the book listing on gln.kemdikbud.go.id.

    NOTE: ``parse`` stores the ``scrapy.Request`` object itself under
    'PDF Link'. The request is placed in the item rather than yielded, so
    the value exported for that field is a request, not a PDF URL.
    """

    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        """Yield one dict per book found on the listing page."""
        # gathering all links
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        total_url = len(book_urls)
        for i in range(total_url):
            title = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")[i].extract()
            url_source = response.xpath("//div[@class='td-module-thumb']//a/@href")[i].get()
            thumbnail = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')[i].extract()
            # This is a Request object, not the value returned by find_details.
            pdf = scrapy.Request(book_urls[i], self.find_details)
            yield {
                'Book Title': title,
                'URL': url_source,
                'Mini IMG': thumbnail,
                'PDF Link': pdf
            }

    def find_details(self, response):
        """Return the first link found inside the post content."""
        # find PDF link
        pdf = response.xpath("//div[@class='td-post-content']//a/@href").get()
        return pdf
How do I add a PDF link field correctly when I export it as CSV? Thanks in advance
pdf = scrapy.Request(book_urls[i], self.find_details)
This means the pdf variable is a Request object, not the PDF link.
Scrapy is asynchronous, so you cannot simply get a return value back from a callback. Instead, yield the request and pass the details to the callback with cb_kwargs.
import scrapy
class MybookSpider(scrapy.Spider):
    """Spider that pairs each listed book with the PDF link on its own page.

    ``parse`` collects the listing fields into a dict and forwards it to
    ``find_details`` through ``cb_kwargs``; ``find_details`` adds the PDF
    link and yields the completed item.
    """

    name = 'mybooks'
    allowed_domains = ['gln.kemdikbud.go.id']
    start_urls = ['https://gln.kemdikbud.go.id/glnsite/category/modul-gls/page/1/']

    def parse(self, response):
        """Request every book page, carrying the listing fields along."""
        # Run each query once instead of once per loop iteration.
        book_urls = response.xpath("//div[@class='td-module-thumb']//a/@href").getall()
        titles = response.xpath("//h3[@class='entry-title td-module-title']//a/text()")
        thumbnails = response.xpath('//*[@class="td-block-span4"]//*[has-class("entry-thumb")]//@src')

        for i, book_url in enumerate(book_urls):
            item = dict()
            item['title'] = titles[i].extract()
            item['url_source'] = book_url
            item['thumbnail'] = thumbnails[i].extract()
            yield scrapy.Request(url=book_url, callback=self.find_details, cb_kwargs={'item': item})

    def find_details(self, response, item):
        """Add the first link inside the post content as ``item['pdf']``.

        ``item`` is the dict passed by ``parse`` via ``cb_kwargs``.
        """
        # find PDF link
        item['pdf'] = response.xpath("//div[@class='td-post-content']//a/@href").get()
        yield item
'''
import scrapy
from ..items import GooddealItem
class FarmtoolsSpider(scrapy.Spider):
    """Spider for the newest private listings on gooddeal.com.

    NOTE: ``parse`` fills a ``GooddealItem`` but yields only the follow-up
    request, and ``parse_item_page`` starts from a new, empty item, so the
    listing fields are not part of the items that get scraped.
    """

    name = 'farmtools'
    allowed_domains = ['www.gooddeal.com']
    start_urls = ['https://www.gooddeal.com/all?source=private&sort=publishdate%20desc']

    def parse(self, response):
        """Follow the link of every listing card whose age reads '0 min'."""
        items = GooddealItem()
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            link = row.xpath('.//a/@href').get()  # this is the full link.
            link_split = link.split('/')[-1]  # this splits the url link the first time.
            linkid = link_split.split('?')[0]  # this splits it the second time.
            title = row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get()
            county = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
            price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
            zero = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
            if zero == '0 min':
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                items['zero'] = zero
                items['link'] = link
                yield response.follow(url=link, callback=self.parse_item_page)

    def parse_item_page(self, response):
        """Yield a ``GooddealItem`` with the fields read from a listing page."""
        items = GooddealItem()
        rows = response.xpath('/html/body[1]')
        for row in rows:
            # NOTE: the trailing commas on the next four assignments turn the
            # values into 1-tuples, e.g. (None,) when the XPath matches nothing.
            category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
            views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
            seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/@href').get(),
            seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
            lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()
            items['category'] = category
            items['views'] = views
            items['seller_id'] = seller_id
            items['seller_ads'] = seller_ads
            items['lifetime_ads'] = lifetime_ads
            yield items
'''
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks
I'm assuming you want the data scraped in parse method to be joined together with the data scraped in the parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter receives a dict with arbitrary data that will be used as argument in the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter. Like this:
def parse_item_page(self, response, scraped_item):
...
Scrapy will take care of sending the scraped_item when calling the parse_item_page.
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This method still works in more recent versions, but cb_kwargs(solution above) are preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
items = response.meta.get('scraped_item') #response.meta is a dict
...
I want to get '25430989' from the end of this url.
https://www.example.com/cars-for-sale/2007-ford-focus-1-6-diesel/25430989
How would I write it using the xpath?
I get the link using this xpath:
# XPath addresses attributes with "@"; .get() returns the first match as a str.
link = row.xpath('.//a/@href').get()
When I use a regex tester I can isolate it with r'(\d+)$', but when I put it into my code it doesn't work for some reason.
import scrapy
import re
from ..items import DonedealItem
class FarmtoolsSpider(scrapy.Spider):
    """Spider for the newest private listings on donedeal.ie."""

    name = 'farmtools'
    allowed_domains = ['www.donedeal.ie']
    start_urls = ['https://www.donedeal.ie/all?source=private&sort=publishdate%20desc']

    def parse(self, response):
        """Yield an item for every listing card whose key info mentions '0 min'."""
        items = DonedealItem()
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            if row.xpath('.//ul[@class="card__body-keyinfo"]/li[contains(text(),"0 min")]/text()'):
                link = row.xpath('.//a/@href').get()  # this is the full link.
                # NOTE: ``link`` is a plain str here (the result of .get()),
                # and str has no .re() method, so this line fails.
                linkid = link.re(r'(\d+)$').get()
                title = row.xpath('.//p[@class="card__body-title"]/text()').get()
                county = row.xpath('.//li[contains(text(),"min")]/following-sibling::node()/text()').get()
                price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
                subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
                items['link'] = link
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                yield items
I'm trying to get the linkid.
The problem is here
link = row.xpath('.//a/@href').get() #this is the full link.
# ``link`` is a str at this point, and str has no .re() method.
linkid = link.re(r'(\d+)$').get()
When you use the .get() method it returns a string that is saved in the link variable, and strings don't have a .re() method for you to call. You can use one of the methods from the re module (docs for reference).
I would use re.findall(); it returns a list of the values that match the regex (in this case only one item), or an empty list if nothing matches. re.search() is also a good choice, but it returns an re.Match object (or None if nothing matches).
import re  # Don't forget to import it
...
link = row.xpath('.//a/@href').get()
# re.findall returns a list of matches (empty when nothing matches).
linkid = re.findall(r'(\d+)$', link)
Now, the Scrapy selectors also support regex, so an alternative would be implementing it like this: (No need for re module)
# Apply the regex on the selector itself; re_first gives the first match.
linkid = row.xpath('.//a/@href').re_first(r'(\d+)$')
Notice I didn't use .get() there.
I'm setting up a new scrapy spider and developed
I am using windows 10 and it's running.
My problem is extracting text from different elements. The text is sometimes in a strong tag and sometimes in a p tag, and the element sometimes has a class and sometimes an id, but I need a single expression that extracts the description text into one row.
Please checkout the link of site
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=404&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193193&fromFeatured=1
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=0&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=202434
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=1218&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193194&fromFeatured=1
https://prnt.sc/nkl1vc,
https://prnt.sc/nkl1zy,
https://prnt.sc/nkl247,
# -*- coding: utf-8 -*-
import scrapy
class OtcnetSpider(scrapy.Spider):
    """Spider that visits every OTC 2019 exhibitor page and yields its details."""

    name = 'otcnet'
    # allowed_domains = ['otcnet.org']
    start_urls = ['https://exhibits.otcnet.org/otc2019/Public/Exhibitors.aspx?Index=All&ID=26006&sortMenu=107000']

    def parse(self, response):
        """Request the page of every exhibitor linked from the listing."""
        links = response.css('a.exhibitorName::attr(href)').extract()
        for link in links:
            ab_link = response.urljoin(link)
            yield scrapy.Request(ab_link, callback=self.parse_p)

    def parse_p(self, response):
        """Yield one dict with the contact and catalogue data of an exhibitor.

        Fields whose node is absent from the page are ``None`` (or an empty
        string for the joined fields). 'Description' is always empty here.
        """
        company = response.xpath('//h1/text()').extract_first()
        if company:
            company = company.strip()

        country = response.xpath('//*[@class="BoothContactCountry"]/text()').extract_first()
        state = response.xpath('//*[@class="BoothContactState"]/text()').extract_first()
        if state:
            state = state.strip()
        address1 = response.xpath('//*[@class="BoothContactAdd1"]/text()').extract_first()
        city = response.xpath('//*[@class="BoothContactCity"]/text()').extract_first()
        if city:
            city = city.strip()
        zip_c = response.xpath('//*[@class="BoothContactZip"]/text()').extract_first()

        # Missing parts become empty strings; the separating spaces are kept.
        # (Stringifying None and deleting 'None' afterwards would also erase
        # a genuine "None" inside an address.)
        address = ' '.join(part or '' for part in (address1, city, state, zip_c))

        website = response.xpath('//*[@id="BoothContactUrl"]/text()').extract_first()

        booth = response.css('.eBoothControls li:nth-of-type(1)::text').extract_first()
        if booth:
            # Guarded so a page without the booth entry does not raise.
            booth = booth.replace('Booth: ', '')

        products = ', '.join(response.css('.caption b::text').extract())

        video_label = response.css('.aa-videos span.hidden-md::text').extract_first()
        video_present = "Yes" if video_label == "Videos" else "No"

        categories = ', '.join(response.css('.ProductCategoryLi a::text').extract())

        yield {
            'Company': company,
            'Country': country,
            'State': state,
            'Address': address,
            'Website': website,
            'Booth': booth,
            'Description': '',
            'Products': products,
            'Video_present': video_present,
            'Conference_link': response.url,
            'Categories': categories
        }
I expect the output would be a row description from different element
According to this post and the excellent answer by @dimitre-novatchev, you need to find a node-set intersection:
$ns1 for your page is:
//p[@class="BoothProfile"]/following-sibling::p
$ns2 is:
//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p
As a result, you need to process the p elements selected by the intersection $ns1[count(.|$ns2) = count($ns2)]:
//p[@class="BoothProfile"]/following-sibling::p[count(.|//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]
You can use this Scrapy code:
# <p> siblings that follow the "BoothProfile" paragraph ...
following_ps = '//p[@class="BoothProfile"]/following-sibling::p'
# ... and <p> siblings that precede the first <div> after that paragraph.
before_first_div = '//p[@class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p'
# Node-set intersection: a node from the first set is kept only when adding
# it to the second set does not change the size of the second set.
description_xpath = '{0}[count(.|{1}) = count({1})]'.format(following_ps, before_first_div)
for p_elem in response.xpath(description_xpath):
    # using string() to stringify <p>
    Description += p_elem.xpath('string(.)').extract_first()
In this code I want to scrape the title, subtitle and data inside the links, but I'm having issues on pages beyond 1 and 2, as only 1 item gets scraped. I want to extract only those entries whose title contains Delhivery.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem
class criticspider(CrawlSpider):
    """Spider for Delhivery complaints listed on consumercomplaints.in.

    ``parse`` reads the summary fields of every result table and requests
    the complaint page; ``anchor_page`` adds the complaint text and yields
    the finished item.
    """

    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        """Request each complaint page, passing the partial item in ``meta``."""
        sites = response.xpath('//table[@width="100%"]')
        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            #item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    # Relative link: resolve it against the current page.
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

    def anchor_page(self, response):
        """Add the complaint text to the item carried in the request meta."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change the item['title'] to this:
# Text of the element that contains the "Delhivery" span.
item['title'] = ''.join(site.xpath('//table[@width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])
Also edit sites to this to extract the required links only (ones with Delhivery in it)
# Only the blocks that contain a span whose text is exactly "Delhivery".
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
so I understand now that you need to add a pagination rule to your code.
it should be something like this:
You just need to add your imports and write the new xpaths from the item's link itself, such as this one
class criticspider(CrawlSpider):
    """Crawl the Delhivery search results page by page and scrape each complaint.

    The first rule follows the pagination links; the second sends every
    complaint link to ``parse_item``.
    """

    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=('//div[@class="pagelinks"]', ),
                # Raw string, so that \d reaches the regex engine unchanged.
                allow=(r'page=\d+', ),
                unique=True,
            ),
            follow=True,
        ),
        # Extract links of items on each page the spider gets from the first rule
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]', )),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Build one ``DelhiveryItem`` from a single complaint page."""
        item = DelhiveryItem()
        # Populate the item object here the same way you did; this function
        # will be called for each item link.
        # This means that you'll be extracting data from pages like this one:
        # http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
Also, I suggest that when you write an XPath you avoid styling attributes: try to use @class or @id, and only use @width, @style or other styling attributes if it's the only way.