How to scrape multiple pages with Scrapy?

I'm trying to scrape a table that spans multiple pages. With the following code I can print the first page's data:
import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
I have written the following code to download all the pages, based on other posts that I have read:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
When I try to print all the pages I don't get anything. Can anyone help me figure out what the mistake is?

Scrapy needs the parse callback first: the responses for start_urls go to the parse method by default (see the Scrapy docs).
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse", follow=True),)

    def parse(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
or just override the start_requests method with a different callback:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Here is code to crawl all the pages:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html
from w3lib.url import add_or_replace_parameter

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
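If you want the scraped rows in a file rather than just printed, any of these spiders can be run with Scrapy's built-in feed export; the output file name here is just an example:

scrapy crawl education3 -o education.csv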

Related

scrapy list of links from parse result

Here's my current code:
# scrape all the cafe links from example.com
import scrapy, re
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Selector

class DengaSpider(scrapy.Spider):
    name = 'cafes'
    allowed_domains = ['example.com']
    start_urls = [
        'http://example.com/archives/8136.html',
    ]
    cafeOnlyLink = []

    def parse(self, response):
        cafelink = response.xpath('//li/a[contains(@href, "archives")]/@href').extract()
        twoHourRegex = re.compile(r'^http://example\.com/archives/\d+.html$')
        cafeOnlyLink = [s for s in cafelink if twoHourRegex.match(s)]
So how should I continue parsing the content from each URL contained in the cafeOnlyLink list? And I want to save all the results from each page in a CSV file.
You can use something like this:
for url in cafeOnlyLink:
    yield scrapy.Request(url=url, callback=self.parse_save_to_csv)

def parse_save_to_csv(self, response):
    # The content is in response.body, so you have to select what information
    # you want to send to the csv file.
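As a rough sketch of that last step (the XPath and field names below are placeholders, since the real cafe page markup isn't shown here), you could yield plain dicts and let Scrapy's feed export produce the CSV:

def parse_save_to_csv(self, response):
    # Hypothetical selectors -- replace them with ones matching the actual cafe pages.
    yield {
        'name': response.xpath('//h1/text()').extract_first(),
        'url': response.url,
    }

Running the spider with scrapy crawl cafes -o cafes.csv then writes one CSV row per yielded item.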

scrapy yield Request not working

I wrote the following scrapy spider but it's not continuing the crawling process after the initial request, although I've yielded more scrapy.Requests for scrapy to follow.
import regex as re
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider

class myspider(Spider):
    name = 'haha'
    allowed_domains = ['https://blog.scrapinghub.com/']
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow=allowed_domains)

    def parse(self, response):
        # To extract all the links on this page
        links_in_page = self.extractor.extract_links(response)
        for link in links_in_page:
            yield scrapy.Request(link.url, callback=self.parse)
allowed_domains needs to be a list of domains, not a list of URLs.
So it should be:
allowed_domains = ['blog.scrapinghub.com']
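For reference, a minimal sketch of the corrected spider attributes (everything else stays the same); with a full URL in allowed_domains, Scrapy's offsite filtering can end up dropping every followed request, which is why the crawl stops after the first page:

class myspider(Spider):
    name = 'haha'
    # Domains only: no scheme, no trailing slash.
    allowed_domains = ['blog.scrapinghub.com']
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow=allowed_domains)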

Spider to download images does not seem to work though I have PIL installed

I am a newbie to Scrapy. I am trying to write a spider to download images. For using the image pipeline, is installing PIL sufficient? My PIL is located in
/usr/lib/python2.7/dist-packages/PIL
How do I include it in my Scrapy project?
Settings file:
BOT_NAME = 'paulsmith'
BOT_VERSION = '1.0'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGE_STORE = '/home/jay/Scrapy/paulsmith/images'
SPIDER_MODULES = ['paulsmith.spiders']
NEWSPIDER_MODULE = 'paulsmith.spiders'
DEFAULT_ITEM_CLASS = 'paulsmith.items.PaulsmithItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
Items file:
from scrapy.item import Item, Field

class PaulsmithItem(Item):
    image_urls = Field()
    image = Field()
    pass
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from paulsmith.items import PaulsmithItem

class PaulSmithSpider(BaseSpider):
    name = "Paul"
    allowed_domains = ["http://www.paulsmith.co.uk/uk-en/shop/mens"]
    start_urls = ["http://www.paulsmith.co.uk/uk-en/shop/mens/jeans"]

    def parse(self, response):
        item = PaulsmithItem()
        #open('paulsmith.html','wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div[@class="category-products"]')
        item['image_urls'] = hxs.select("//div[@class='category-products']//a/img/@src").extract()
        #for site in sites:
        #    print site.extract()
        #    image = site.select('//a/img/@src').extract()
        return item

SPIDER = PaulSmithSpider()
You may not have set IMAGES_STORE = '/path/to/valid/dir' (your settings define IMAGE_STORE, which Scrapy does not read).
Moreover, try using a custom images pipeline like this:
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
You can check whether the image_urls are being requested from the get_media_requests method.
Reference: http://doc.scrapy.org/en/latest/topics/images.html
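To actually enable that pipeline, the settings need updating as well. A sketch, assuming the class above is saved as paulsmith/pipelines.py (that module path is an assumption); note the setting name is IMAGES_STORE, not IMAGE_STORE:

# settings.py
ITEM_PIPELINES = ['paulsmith.pipelines.MyImagesPipeline']  # assumed module path
IMAGES_STORE = '/home/jay/Scrapy/paulsmith/images'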

Scrapy issue with csv output

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request

class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()

        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
and here is my pipeline:
import csv

class myExporter(object):

    def __init__(self):
        self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item
My problem is that for the first two fields (eponimia, address), only the first character is written to the output CSV file, and I can't find out why.
Any help would be much appreciated; I am out of ideas.
Remove the zip call from myExporter.process_item:
def process_item(self, item, spider):
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    item['category'].encode('utf-8')])
    return item
You already split the lists into individual items in vriskoSpider.parse_start_url.
zip iterates over your strings character by character:
In [1]: a = 'test1'

In [2]: b = 'test2'

In [3]: for x, y in zip(a, b):
   ...:     print x, y
   ...:
t t
e e
s s
t t
1 2

Scrapy won't get results from first page

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem

class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):
Or you can remove start_urls and start your spider with def start_requests(self), using parse_vrisko as the callback.