How to fixing different element text to extract - scrapy

I'm setting up a new scrapy spider and developed
I am using windows 10 and it's running.
My problem is extracting text from different element. This elements sometime on (strong tag, p,) sometime have class , sometime have id but i need to implement to one element to extracting a row text.
Please checkout the link of site
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=404&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193193&fromFeatured=1
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=0&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=202434
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=1218&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193194&fromFeatured=1
https://prnt.sc/nkl1vc,
https://prnt.sc/nkl1zy,
https://prnt.sc/nkl247,
# -*- coding: utf-8 -*-
import scrapy
class OtcnetSpider(scrapy.Spider):
name = 'otcnet'
# allowed_domains = ['otcnet.org']
start_urls = ['https://exhibits.otcnet.org/otc2019/Public/Exhibitors.aspx?Index=All&ID=26006&sortMenu=107000']
def parse(self, response):
links = response.css('a.exhibitorName::attr(href)').extract()
for link in links:
ab_link = response.urljoin(link)
yield scrapy.Request(ab_link, callback=self.parse_p)
def parse_p(self, response):
url = response.url
Company = response.xpath('//h1/text()').extract_first()
if Company:
Company = Company.strip()
Country = response.xpath('//*[#class="BoothContactCountry"]/text()').extract_first()
State = response.xpath('//*[#class="BoothContactState"]/text()').extract_first()
if State:
State = State.strip()
Address1 = response.xpath('//*[#class="BoothContactAdd1"]/text()').extract_first()
City = response.xpath('//*[#class="BoothContactCity"]/text()').extract_first()
if City:
City = City.strip()
zip_c = response.xpath('//*[#class="BoothContactZip"]/text()').extract_first()
Address = str(Address1)+' '+str(City)+' '+str(State)+' '+str(zip_c)
Website = response.xpath('//*[#id="BoothContactUrl"]/text()').extract_first()
Booth = response.css('.eBoothControls li:nth-of-type(1)::text').extract_first().replace('Booth: ','')
Description = ''
Products = response.css('.caption b::text').extract()
Products= ', '.join(Products)
vid_bulien = response.css('.aa-videos span.hidden-md::text').extract_first()
if vid_bulien=="Videos":
vid_bulien = "Yes"
else:
vid_bulien = "No"
Video_present = vid_bulien
Conference_link = url
Categories = response.css('.ProductCategoryLi a::text').extract()
Categories = ', '.join(Categories)
Address = Address.replace('None','')
yield {
'Company':Company,
'Country':Country,
'State':State,
'Address':Address,
'Website':Website,
'Booth':Booth,
'Description':Description,
'Products':Products,
'Video_present':Video_present,
'Conference_link':Conference_link,
'Categories':Categories
}
I expect the output would be a row description from different element

According to this post and excellent #dimitre-novatchev answer you need to find a node-set intersection:
$ns1 for your page is:
//p[#class="BoothProfile"]/following-sibling::p
$ns2 is:
p[#class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p
as a result you need to process these p elements:
//p[#class="BoothProfile"]/following-sibling::p[count(.|//p[#class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[#class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]
You can use this Scrapy code:
for p_elem in response.xpath('//p[#class="BoothProfile"]/following-sibling::p[count(.|//p[#class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p) = count(//p[#class="BoothProfile"]/following-sibling::div[1]/preceding-sibling::p)]'):
# using string() to stringify <p>
Description += p_elem.xpath('string(.)').extract_first()

Related

How do I connect items from one parse method to another?

'''
import scrapy
from ..items import GooddealItem
class FarmtoolsSpider(scrapy.Spider):
name = 'farmtools'
allowed_domains = ['www.gooddeal.com']
start_urls = ['https://www.gooddeal.com/all?
source=private&sort=publishdate%20desc']
def parse(self, response):
items = GooddealItem()
rows = response.xpath('//ul[#class="card-collection"]/li')
for row in rows:
link = row.xpath('.//a/#href').get() #this is the full link.
link_split = link.split('/')[-1] #this splits the url link th first time.
linkid = link_split.split('?')[0] #this splits it the second time.
title = row.xpath('.//div[1]/p[#class="card__body-title"]/text()').get()
county = row.xpath('.//a/div/div[2]/div[1]/ul[#class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
price = row.xpath('.//p[#class="card__price"]/span[1]/text()').get()
subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
zero = row.xpath('.//a/div/div[2]/div[1]/ul[#class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
if zero == '0 min':
items['linkid'] = linkid
items['title'] = title
items['county'] = county
items['price'] = price
items['subcat'] = subcat
items['zero'] = zero
items['link'] = link
yield response.follow(url = link, callback=self.parse_item_page)
def parse_item_page(self, response):
items = GooddealItem()
rows = response.xpath('/html/body[1]')
for row in rows:
category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/#href').get(),
seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()
items['category'] = category
items['views'] = views
items['seller_id'] = seller_id
items['seller_ads'] = seller_ads
items['lifetime_ads'] = lifetime_ads
yield items
'''
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks
I'm assuming you want the data scraped in parse method to be joined together with the data scraped in the parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter receives a dict with arbitrary data that will be used as argument in the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter. Like this:
def parse_item_page(self, response, scraped_item):
...
Scrapy will take care of sending the scraped_item when calling the parse_item_page.
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This method still works in more recent versions, but cb_kwargs(solution above) are preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
items = response.meta.get('scraped_item') #response.meta is a dict
...

I want to extract the numbers at the end of a url using regular expressions in scrapy

I want to get '25430989' from the end of this url.
https://www.example.com/cars-for-sale/2007-ford-focus-1-6-diesel/25430989
How would I write it using the xpath?
I get the link using this xpath:
link = row.xpath('.//a/#href').get()
When I use a regex tester I can isolate it with r'(\d+)$ but when I put it into my code it doesn't work for some reason.
import scrapy
import re
from ..items import DonedealItem
class FarmtoolsSpider(scrapy.Spider):
name = 'farmtools'
allowed_domains = ['www.donedeal.ie']
start_urls = ['https://www.donedeal.ie/all?source=private&sort=publishdate%20desc']
def parse(self, response):
items = DonedealItem()
rows = response.xpath('//ul[#class="card-collection"]/li')
for row in rows:
if row.xpath('.//ul[#class="card__body-keyinfo"]/li[contains(text(),"0 min")]/text()'):
link = row.xpath('.//a/#href').get() #this is the full link.
linkid = link.re(r'(\d+)$).get()
title = row.xpath('.//p[#class="card__body-title"]/text()').get()
county = row.xpath('.//li[contains(text(),"min")]/following-sibling::node()/text()').get()
price = row.xpath('.//p[#class="card__price"]/span[1]/text()').get()
subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
items['link'] = link
items['linkid'] = linkid
items['title'] = title
items['county'] = county
items['price'] = price
items['subcat'] = subcat
yield items
I'm trying to get the linkid.
The problem is here
link = row.xpath('.//a/#href').get() #this is the full link.
linkid = link.re(r'(\d+)$).get()
When you use the .get() method it returns a string that is saved in the link variable, and strings don't have a .re() method for you to call. You can use one of the methods from the re module (docs for reference).
I would use re.findall(), it will return you a list of values that matches the regex (in this case only one item would return), or None if nothing matches. re.search() is also a good choice, but will return you an re.Match object.
import re #Don't forget to import it
...
link = row.xpath('.//a/#href').get()
linkid = re.findall(r'(\d+)$', link)
Now, the Scrapy selectors also support regex, so an alternative would be implementing it like this: (No need for re module)
linkid = row.xpath('.//a/#href').re_first(r'(\d+)$')
Notice I didn't use .get() there.

Scrapy - How to define structure of the csv export (columns,etc..)

I am quite new with scrapy, and i'm to figure out how to set the structure of a csv export. I have in the following example 2 kind of data scraped: ids and links
Here is the simple code i'm using :
class MybotSpider(scrapy.Spider):
name = 'mybot'
start_urls = ['url']
def parse(self, response):
all_dataids = response.css('li::attr(data-id)').extract()
all_links = response.xpath('//a[contains(#class, "js_clickable")]/#href').extract()
adlist = SpiderItem()
adlist['dataid'] = all_dataids
adlist['link'] = all_links
yield adlist
But my export is like this :
instead, i would like to export with for each id corresponding the href and separated in rows :
Make yielding items in this way:
def parse(self, response):
all_dataids = response.css('li::attr(data-id)').extract()
all_links = response.xpath('//a[contains(#class, "js_clickable")]/#href').extract()
for link, dataid in zip(all_links, all_dataids):
adlist = SpiderItem()
adlist['dataid'] = dataid
adlist['link'] = link
yield adlist
Here you zip your arrays to ((link, dataid), (link, dataid), (link, dataid), ...) and then yielding them one by one. So it should give you desired output.

scrapy isn't working right in extracting the title

In this code I want to scrape title,subtitle and data inside the links but having issues on pages beyond 1 and 2 as getting only 1 item scraped.I want to extract only those entries having title as delhivery only
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem
class criticspider(CrawlSpider):
name = "delh"
allowed_domains = ["consumercomplaints.in"]
start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]
def parse(self, response):
sites = response.xpath('//table[#width="100%"]')
items = []
for site in sites:
item = DelhiveryItem()
item['title'] = site.xpath('.//td[#class="complaint"]/a/span[#style="background-color:yellow"]/text()').extract()[0]
#item['title'] = site.xpath('.//td[#class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
item['subtitle'] = site.xpath('.//td[#class="compl-text"]/div/b[1]/text()').extract()[0]
item['date'] = site.xpath('.//td[#class="small"]/text()').extract()[0].strip()
item['username'] = site.xpath('.//td[#class="small"]/a[2]/text()').extract()[0]
item['link'] = site.xpath('.//td[#class="complaint"]/a/#href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//td[#style="padding-bottom:15px"]/div/text()').extract()[0]
yield old_item
You need to change the item['title'] to this:
item['title'] = ''.join(site.xpath('//table[#width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])
Also edit sites to this to extract the required links only (ones with Delhivery in it)
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
so I understand now that you need to add a pagination rule to your code.
it should be something like this:
You just need to add your imports and write the new xpaths from the item's link itself, such as this one
class criticspider(CrawlSpider):
name = "delh"
allowed_domains = ["consumercomplaints.in"]
start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
rules = (
# Extracting pages, allowing only links with page=number to be extracted
Rule(SgmlLinkExtractor(restrict_xpaths=('//div[#class="pagelinks"]', ), allow=('page=\d+', ),unique=True),follow=True),
# Extract links of items on each page the spider gets from the first rule
Rule(SgmlLinkExtractor(restrict_xpaths=('//td[#class="complaint"]', )), callback='parse_item'),
)
def parse_item(self, response):
item = DelhiveryItem()
#populate item object here the same way you did, this function will be called for each item link.
#This meand that you'll be extracting data from pages like this one :
#http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
item['title'] = response.xpath('<write xpath>').extract()[0]
item['subtitle'] = response.xpath('<write xpath>').extract()[0]
item['date'] = response.xpath('<write xpath>').extract()[0].strip()
item['username'] = response.xpath('<write xpath>').extract()[0]
item['link'] = response.url
item['data'] = response.xpath('<write xpath>').extract()[0]
yield item
Also I suggest when you write an xpath, that you don't use any styling parameters, try to use #class or #id, only use #width, #style or any styling params if it's the only way.

Successively capturing data in nested pages with Scrapy

I'm trying to scrape a fairly straightforward website with a Scrapy BaseSpider since I know in advance where all of the links that I want to crawl are.
The basic layout of the site to be crawled is
List of States
List of Counties within a State
List of agencies within a County
Information about a single agency
I can successfully navigate and get data at all 4 levels, however, my county field is not being populated correctly. For a given agency, instead of the actual county it is in, I get the last county in the State the agency is located in.
Example:
OH - County #3 - Agency #1 (should be County #1)
OH - County #3 - Agency #2 (should be County #2)
OH - County #3 - Agency #3 (correct)
Can't seem to figure out something that I think is relatively simple.
Here's the code:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from agencyspider.items import AgencyItem
from scrapy.http import Request
class BasicspiderSpider(BaseSpider):
name = "basicSpider"
allowed_domains = ["usacops.com"]
start_urls = [
'http://www.usacops.com/',
]
items = {}
def parse(self, response):
sel = Selector(response)
states = sel.xpath('//comment()[.=" Begin State Names "]/following::table[1]/tr/td/a')
for s in states:
item = AgencyItem()
state = s.xpath('text()').extract()[0]
url = s.xpath('#href').extract()[0]
item['state'] = state
item['stateUrl']= url
yield Request(url=url,callback=self.parse_counties,meta={'item':item})
def parse_counties(self, response):
sel = Selector(response)
counties = sel.xpath('//comment()[.=" Begin Counties "]/following::table[1]/tr/td/font/a | //comment()[.=" Begin Counties "]/following::table[1]/tr/td/a')
for c in counties:
item = response.request.meta["item"]
county = c.xpath('text()').extract()[0]
countyUrl = c.xpath('#href').extract()[0]
url = item["stateUrl"] + countyUrl
item["county"]=county
item["countyUrl"]=url
yield Request(url=url, callback=self.parse_agencies,meta={'item':item})
def parse_agencies(self,response):
sel = Selector(response)
agencies = sel.xpath('//table[9]/tr/td/table[2]/tr/td/font/a | //table[9]/tr/td/table[2]/tr/td/a')
for a in agencies:
item = response.request.meta["item"]
agency = a.xpath('text()').extract()[0]
agencyUrl = a.xpath('#href').extract()[0]
url = item["stateUrl"] + agencyUrl
item["agency"] = agency
item["agencyUrl"] = url
yield Request(url=url, callback=self.parse_agencyinfo,meta={'item':item})
def parse_agencyinfo(self,response):
sel = Selector(response)
item = response.request.meta["item"]
item["agency"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[1]/text()').extract())
item["admintype"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[2]/text()').extract())
item["adminhead"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[3]/text()[1]').extract())
item["address"]= ' '.join(sel.xpath('//comment()[.=" Begin center section "]/following::table/tr/td/strong/font[3]/text()[position()>1]').extract())
return item
Hey so the problem is every time you assign item = response.request.meta["item"] your referencing and assigning the same item over and over again.
Fortunately its an easy fix! Just wrap response.request.meta["item"] with AgencyItem(response.request.meta["item"]) to create a copy of the state item for each county.
Also don't forget to do the same in other callbacks or else you'll have the problem with other fields. Hope that helps!