I am currently building my first Scrapy project, and I am trying to extract data from an HTML table. Here is my crawl spider so far:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from digikey.items import DigikeyItem
from scrapy.selector import Selector

class DigikeySpider(CrawlSpider):
    name = 'digikey'
    allowed_domains = ['digikey.com']
    start_urls = [
        'https://www.digikey.com/products/en/capacitors/aluminum-electrolytic-capacitors/58/page/3?stock=1',
        'https://www.digikey.com/products/en/capacitors/aluminum-electrolytic-capacitors/58/page/4?stock=1',
    ]

    rules = (
        # Extract links matching the paginated category URL (but not 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(LinkExtractor(allow=(r'/products/en/capacitors/aluminum-electrolytic-capacitors/58/page/3\?stock=1', ), deny=(r'subsection\.php', ))),
    )

    def parse_item(self, response):
        item = DigikeyItem()
        item['partnumber'] = response.xpath('//td[@class="tr-mfgPartNumber"]/a/span[@itemprop="name"]/text()').extract()
        item['manufacturer'] = response.xpath('//td[6]/span/a/span/text()').extract()
        item['description'] = response.xpath('//td[@class="tr-description"]/text()').extract()
        item['quanity'] = response.xpath('//td[@class="tr-qtyAvailable ptable-param"]//text()').extract()
        item['price'] = response.xpath('//td[@class="tr-unitPrice ptable-param"]/text()').extract()
        item['minimumquanity'] = response.xpath('//td[@class="tr-minQty ptable-param"]/text()').extract()
        yield item

    parse_start_url = parse_item
It scrapes the table at www.digikey.com/products/en/capacitors/aluminum-electrolytic-capacitors/58/page/4?stock=1 and exports all the data to a digikey.csv file, but all of the data ends up in one cell.
(screenshot: CSV file with the scraped data all in one cell)
settings.py
BOT_NAME = 'digikey'
SPIDER_MODULES = ['digikey.spiders']
NEWSPIDER_MODULE = 'digikey.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'digikey ("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36")'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
I want the data scraped one row at a time, with each piece of information associated with its part number.
items.py
import scrapy

class DigikeyItem(scrapy.Item):
    partnumber = scrapy.Field()
    manufacturer = scrapy.Field()
    description = scrapy.Field()
    quanity = scrapy.Field()
    minimumquanity = scrapy.Field()
    price = scrapy.Field()
Any help is much appreciated!
The problem is that you're loading whole columns into each field of a single item. What you want is something like:
for row in response.css('table#productTable tbody tr'):
    item = DigikeyItem()
    item['partnumber'] = (row.css('.tr-mfgPartNumber [itemprop="name"]::text').extract_first() or '').strip()
    item['manufacturer'] = (row.css('[itemprop="manufacture"] [itemprop="name"]::text').extract_first() or '').strip()
    item['description'] = (row.css('.tr-description::text').extract_first() or '').strip()
    item['quanity'] = (row.css('.tr-qtyAvailable::text').extract_first() or '').strip()
    item['price'] = (row.css('.tr-unitPrice::text').extract_first() or '').strip()
    item['minimumquanity'] = (row.css('.tr-minQty::text').extract_first() or '').strip()
    yield item
I've changed the selectors a bit to make them shorter. By the way, please avoid the manual extract_first and strip repetitions I've used here (just for testing purposes), and consider using Item Loaders instead; they make it easier to take the first match and strip/format the desired output.
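For reference, a minimal Item Loader sketch of that idea, reusing the same fields and row selectors as above (note: on older Scrapy versions the processors live in scrapy.loader.processors rather than the itemloaders package):

from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class DigikeyLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)   # strip every extracted value
    default_output_processor = TakeFirst()            # keep the first non-empty value

# inside DigikeySpider:
def parse_item(self, response):
    for row in response.css('table#productTable tbody tr'):
        loader = DigikeyLoader(item=DigikeyItem(), selector=row)
        loader.add_css('partnumber', '.tr-mfgPartNumber [itemprop="name"]::text')
        loader.add_css('manufacturer', '[itemprop="manufacture"] [itemprop="name"]::text')
        loader.add_css('description', '.tr-description::text')
        loader.add_css('quanity', '.tr-qtyAvailable::text')
        loader.add_css('price', '.tr-unitPrice::text')
        loader.add_css('minimumquanity', '.tr-minQty::text')
        yield loader.load_item()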
Related
I am trying to follow several URLs on a page, pass the response to the next parser to grab another set of URLs, and from that page grab the next-page URLs as well. However, I wanted to try building the next-page URL by parsing and manipulating the page string, then passing the result on as the next page. The scraper crawls, but it returns nothing, not even the output of the final parser where I load the item.
Note: I know that I can grab the next page rather simply with an if-statement on the href. However, I wanted to try something different in case I had to face a situation where I would have to do this.
Here's my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()

class ZooplasSpider(scrapy.Spider):
    name = 'zooplas'
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def start_request(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback = self.parse, )

    def parse(self, response):
        container = response.xpath("//ul[@class='list-inline list-unstyled']//li")
        for links in container:
            urls = links.xpath(".//a/@href").get()
            yield response.follow(
                urls, callback = self.parse_places
            )

    def parse_places(self, response):
        container = response.xpath("//ul[@class='listing-results clearfix js-gtm-list']//li")
        for links in container:
            urls = links.xpath('(//div[@class="listing-results-right clearfix"]//a)[position() mod 3=1]//@href').get()
            yield response.follow(
                urls, callback = self.parse_listings
            )

        if response.xpath("//div[@id='content']//div//h1//text()").extract_first():
            page_on = response.xpath("//div[@id='content']//div//h1//text()").extract_first()
            name_of_page = page_on.split()[-1]
        else:
            pass

        if response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first():
            url_link = response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first()
            url_link = url_link.split('/')
            last_page = url_link[-1].split('=')[-1]
        else:
            pass

        all_pages = []
        for index, n in enumerate(url_link):
            for page_name, page_num in zip(name_of_page, last_page):
                if index == 5:
                    url_link[index] = page_name
                    testit = '/'.join(url_link)
                    equal_split = testit.split('=')
                    for another_i, n2 in enumerate(equal_split):
                        if another_i == 3:
                            for range_val in range(1, page_num+1):
                                equal_split[another_i] = str(2)
                                all_pages.append('='.join(equal_split))

        for urls in all_pages:
            yield response.follow(
                urls, callback = self.parse.places
            )

    def parse_listings(self, response):
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default.output_processor = TakeFirst()
        loader.add_xpath("//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        yield loader.load_item()

process = CrawlerProcess(
    settings = {
        'FEED_URI': 'zoopla.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ZooplasSpider)
process.start()
I know that the way of grabbing the URLs works, as I have tried it on a single URL using the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany','France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j=0
for index, n in enumerate(a):
for l_stuff, p_stuff in zip(list_of_stuff,pages_of_stuff):
if index == 5:
a[index] = l_stuff
testit='/'.join(a)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, p_stuff+1):
equal_split[another_i] = str(range_val)
print('='.join(equal_split))
This is the same as the code used above, just with a change of variables. It outputs the following links, and they work:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
Your use case is suited to Scrapy's crawl spider. You can write rules for how to extract links to the properties and how to extract links to the next pages. I have changed your code to use a crawl spider class, and I have changed your feed settings to use the recommended FEEDS setting; FEED_URI and FEED_FORMAT are deprecated in newer versions of Scrapy.
Read more about the crawl spider in the docs.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()

class ZooplasSpider(CrawlSpider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),  # follow the country links
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),  # follow pagination links
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"), callback='parse_item', follow=True),  # follow the link to the actual property listing
    )

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_xpath("country", "//li[@class='ui-breadcrumbs__item'][3]/a/text()")
        yield loader.load_item()

if __name__ == '__main__':
    process = CrawlerProcess(
        settings = {
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
I am using the Scrapinghub API and shub to deploy my project. However, the item fields do not come out in the order I want.
Unfortunately, I need it in the following order --> Title, Publish Date, Description, Link. How can I get the output to be in exactly that order for every item class?
Below is a short sample of my spider:
import scrapy
from scrapy.spiders import XMLFeedSpider
from tickers.items import tickersItem

class Spider(XMLFeedSpider):
    name = "Scraper"
    allowed_domains = ["yahoo.com"]
    start_urls = (
        'https://feeds.finance.yahoo.com/rss/2.0/headline?s=ABIO,ACFN,AEMD,AEZS,AITB,AJX,AU,AKERMN,AUPH,AVL,AXPW',
        'https://feeds.finance.yahoo.com/rss/2.0/headline?s=DRIO',
        'https://feeds.finance.yahoo.com/rss/2.0/headline?s=IDXG,IMMU,IMRN,IMUC,INNV,INVT,IPCI,INPX,JAGX,KDMN,KTOV,LQMT',
    )
    itertag = 'item'

    def parse_node(self, response, node):
        item = {}
        item['Title'] = node.xpath('title/text()').extract_first()
        item['Description'] = node.xpath('description/text()').extract_first()
        item['Link'] = node.xpath('link/text()').extract_first()
        item['PublishDate'] = node.xpath('pubDate/text()').extract_first()
        return item
Additionally, here is my items.py file; it declares the fields in the same order as my spider, so I have no idea why the output is not in order.
items.py:
import scrapy

class tickersItem(scrapy.Item):
    Title = scrapy.Field()
    Description = scrapy.Field()
    Link = scrapy.Field()
    PublishDate = scrapy.Field()
The field order matches between the items file and the spider file, and I have no idea how to fix this. I am a new Python programmer.
Instead of defining items in items.py, you could use collections.OrderedDict. Just import the collections module and, in the parse_node method, change the line:
item = {}
to:
item = collections.OrderedDict()
Or, if you want defined items, you could use the approach outlined in this answer. Your items.py would then contain this code:
from collections import OrderedDict
from scrapy import Field, Item
import six

class OrderedItem(Item):
    def __init__(self, *args, **kwargs):
        self._values = OrderedDict()
        if args or kwargs:  # avoid creating dict for most common case
            for k, v in six.iteritems(dict(*args, **kwargs)):
                self[k] = v

class tickersItem(OrderedItem):
    Title = Field()
    Description = Field()
    Link = Field()
    PublishDate = Field()
You should then also modify your spider code to use this item, accordingly. Refer to the documentation.
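For example, a minimal sketch of parse_node using that item; since OrderedItem stores values in an OrderedDict, assigning the fields in the order you want (Title, PublishDate, Description, Link) is what determines the output order:

def parse_node(self, response, node):
    item = tickersItem()
    # assign in the desired output order
    item['Title'] = node.xpath('title/text()').extract_first()
    item['PublishDate'] = node.xpath('pubDate/text()').extract_first()
    item['Description'] = node.xpath('description/text()').extract_first()
    item['Link'] = node.xpath('link/text()').extract_first()
    return item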
I want to crawl the images from this link: "http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html", but the code only crawls one picture on my computer, while it crawls all pictures on my friend's computer. Please help me.
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from imgur.items import ImgurItem

class ImgurSpider(CrawlSpider):
    name = 'imgur'
    allowed_domains = ['vnexpress.net']
    start_urls = ['http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html']
    # rules = [Rule(LinkExtractor(allow=['/*']), 'parse123')]

    def parse(self, response):
        image = ImgurItem()
        # image['title'] = response.xpath(
        #     "//img[data-notes-url='']").extract()
        rel = response.xpath("//div[@id='article_content']//img/@src").extract()
        image['image_urls'] = [rel[0]]
        return image
rel = response.xpath("//div[@id='article_content']//img/@src").extract()
image['image_urls'] = [rel[0]]
You take only one link by specifying [0] index.
Try
image['image_urls'] = rel
You can also split your code into a URL-parsing function and a callback for downloading the images, as sketched below.
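A rough sketch of that split, assuming two stages: one callback that gathers the links to follow, and one that collects the image URLs for the item (the link XPath in the first stage is only illustrative):

def parse(self, response):
    # first stage: gather links to article pages (illustrative XPath)
    for link in response.xpath("//a/@href").extract():
        yield scrapy.Request(response.urljoin(link), callback=self.parse_images)

def parse_images(self, response):
    # second stage: collect every image URL on the article page
    image = ImgurItem()
    image['image_urls'] = response.xpath("//div[@id='article_content']//img/@src").extract()
    yield image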
I want my spider to scrape the listings on every page of a website. I used CrawlSpider and LinkExtractor, but when I looked at the CSV file, nothing on the first page (i.e. the start URL) was scraped; the scraped items started from page 2. I tested my crawler in the Scrapy shell and it seemed fine. I can't figure out where the problem lies. Below is my spider code. Please help. Thanks a lot!
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from shputuo.items_shputuo import ShputuoItem

class Shputuo(CrawlSpider):
    name = "shputuo"
    allowed_domains = ["shpt.gov.cn"]  # DO NOT use www in allowed domains
    start_urls = ["http://www.shpt.gov.cn/gb/n6132/n6134/n6156/n7110/n7120/index.html"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=("//div[@class = 'page']/ul/li[5]/a",)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath("//div[@class = 'neirong']/ul/li"):
            item = ShputuoItem()
            word = sel.xpath("a/text()").extract()[0]
            item['id'] = word[3:11]
            item['title'] = word[11:len(word)]
            item['link'] = "http://www.shpt.gov.cn" + sel.xpath("a/@href").extract()[0]
            item['time2'] = sel.xpath("span/text()").extract()[0][1:11]

            request = scrapy.Request(item['link'], callback=self.parse_content)
            request.meta['item'] = item
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        item['question'] = response.xpath("//div[@id = 'ivs_content']/p[2]/text()").extract()[0]
        item['question'] = "".join(map(unicode.strip, item['question']))  # get rid of unwanted spaces and other characters
        item['reply'] = response.xpath("//div[@id = 'ivs_content']/p[3]/text()").extract()[0]
        item['reply'] = "".join(map(unicode.strip, item['reply']))
        item['agency'] = item['reply'][6:10]
        item['time1'] = "2015-" + item['question'][0] + "-" + item['question'][2]
        yield item
It looks like what you really need is to also parse the elements of the start_urls responses, not only to follow the rules.
For that, use the parse_start_url method, which is the default callback for the start_urls requests.
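A minimal sketch of that fix, reusing the parse_items callback you already have:

class Shputuo(CrawlSpider):
    # ... name, allowed_domains, start_urls and rules stay exactly as above ...

    def parse_start_url(self, response):
        # the response for the start URL is routed here instead of through the rules,
        # so hand it to the same item-parsing logic
        return self.parse_items(response)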
In this code I want to scrape the title, subtitle and data inside the links, but I am having issues on pages beyond 1 and 2, as only one item gets scraped. I also want to extract only those entries whose title contains "delhivery".
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from delhivery.items import DelhiveryItem

class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2"]

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []

        for site in sites:
            item = DelhiveryItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span[@style="background-color:yellow"]/text()').extract()[0]
            #item['title'] = site.xpath('.//td[@class="complaint"]/a[text() = "%s Delivery Courier %s"]/text()').extract()[0]
            item['subtitle'] = site.xpath('.//td[@class="compl-text"]/div/b[1]/text()').extract()[0]
            item['date'] = site.xpath('.//td[@class="small"]/text()').extract()[0].strip()
            item['username'] = site.xpath('.//td[@class="small"]/a[2]/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@style="padding-bottom:15px"]/div/text()').extract()[0]
        yield old_item
You need to change the item['title'] line to this:
item['title'] = ''.join(site.xpath('//table[@width="100%"]//span[text() = "Delhivery"]/parent::*//text()').extract()[0])
Also edit sites as follows to extract only the required links (the ones with Delhivery in them):
sites = response.xpath('//table//span[text()="Delhivery"]/ancestor::div')
EDIT:
So I understand now that you need to add a pagination rule to your code.
It should be something like this:
You just need to add your imports and write the new XPaths for the item's own page, such as this one:
class criticspider(CrawlSpider):
    name = "delh"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]

    rules = (
        # Extracting pages, allowing only links with page=number to be extracted
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@class="pagelinks"]', ), allow=(r'page=\d+', ), unique=True), follow=True),
        # Extract links of items on each page the spider gets from the first rule
        Rule(SgmlLinkExtractor(restrict_xpaths=('//td[@class="complaint"]', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = DelhiveryItem()
        # populate the item object here the same way you did; this function will be called for each item link.
        # This means that you'll be extracting data from pages like this one:
        # http://www.consumercomplaints.in/complaints/delhivery-last-mile-courier-service-poor-delivery-service-c772900.html#c1880509
        item['title'] = response.xpath('<write xpath>').extract()[0]
        item['subtitle'] = response.xpath('<write xpath>').extract()[0]
        item['date'] = response.xpath('<write xpath>').extract()[0].strip()
        item['username'] = response.xpath('<write xpath>').extract()[0]
        item['link'] = response.url
        item['data'] = response.xpath('<write xpath>').extract()[0]
        yield item
Also, when you write an XPath, I suggest that you don't key on styling attributes: try to use @class or @id, and only fall back on @width, @style or other styling attributes if there is no other way.
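For example, using two selectors that already appear above:

# robust: keyed on a semantic class attribute
response.xpath('.//td[@class="complaint"]/a/@href')

# brittle: keyed on inline styling, use only if there is no alternative
response.xpath('.//td[@style="padding-bottom:15px"]/div/text()')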