Here's my current code:
# Scrape all the cafe links from example.com
import re

import scrapy


class DengaSpider(scrapy.Spider):
    name = 'cafes'
    allowed_domains = ['example.com']
    start_urls = [
        'http://example.com/archives/8136.html',
    ]
    cafeOnlyLink = []

    def parse(self, response):
        cafelink = response.xpath('//li/a[contains(@href, "archives")]/@href').extract()
        twoHourRegex = re.compile(r'^http://example\.com/archives/\d+\.html$')
        cafeOnlyLink = [s for s in cafelink if twoHourRegex.match(s)]
So how should I continue parsing the content from each URL contained in the cafeOnlyLink list? And I want to save the results from each page in a CSV file.
You can use something like this:
for url in cafeOnlyLink:
    yield scrapy.Request(url=url, callback=self.parse_save_to_csv)

def parse_save_to_csv(self, response):
    # The content is in response.body, so you have to select which
    # information you want to send to the CSV file.
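For the CSV part you don't need to write the file yourself: yield one dict (or Item) per page and let Scrapy's feed exports handle it. A minimal sketch of the callback, where the XPath and field names are placeholders you would adapt to the real cafe pages:

def parse_save_to_csv(self, response):
    # Placeholder selectors -- swap in the ones that match the
    # actual cafe page markup.
    yield {
        'url': response.url,
        'title': response.xpath('//h1/text()').get(),
    }

Then run the spider with scrapy crawl cafes -o cafes.csv, and every yielded dict becomes a row in the CSV file.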
In a CrawlSpider, how can I scrape the field marked "4 days ago" in the image before extracting each link?
The CrawlSpider below is working fine, but in parse_item I want to add a new field named 'Add posted', which should hold the value marked in the image.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PropertySpider(CrawlSpider):
    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x) for x in range(0, 5, 20)
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            'Add posted': ?
        }
When using the Rule object of the Scrapy CrawlSpider, the extracted link's text is saved in a meta field of the request named link_text. You can obtain this value in the parse_item method and extract the time information with a regex. You can read more about it in the docs. See the example below.
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PropertySpider(CrawlSpider):
    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x) for x in range(0, 5, 20)
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        link_text = response.request.meta.get("link_text")
        posted = None  # default, in case the link text carries no timestamp
        m = re.search(r"(Last Updated.*ago)", link_text)
        if m:
            posted = m.group(1).replace("\xa0", " ")
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            "Add posted": posted
        }
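For reference, this is roughly how the regex pulls the timestamp out of the link text. The sample string below is hypothetical, modeled on what the listing anchors typically contain; the real text depends on the page markup:

import re

# Hypothetical link text, as the LinkExtractor might capture it
# from a listing anchor.
link_text = "2 Bed Flat, Example Road, E1\xa0Last Updated around 4 days ago"

m = re.search(r"(Last Updated.*ago)", link_text)
if m:
    print(m.group(1).replace("\xa0", " "))  # Last Updated around 4 days ago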
Alternatively, to collect the value in a loop, you can use the following XPath to reach that data point:
x = response.xpath('//div[@class="timeStamp"]')
for i in x:
    yield {'result': i.xpath("./i/following-sibling::text()").get().strip()}
I'm trying to scrape a table that spans multiple pages. With the following code I print the first page's data:
import scrapy
from scrapy.http.request import Request

from indicators.items import EducationIndicators


class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
I have written the following code to download all the pages; it is based on other posts that I have read:
import scrapy
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from indicators.items import EducationIndicators


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
When I try to print all the pages I don't obtain anything. Can anyone help me find the mistake?
Scrapy sends the start_urls responses to the parse callback first (see the Scrapy docs), so your parse_table never sees the first page. One option is to make parse your callback (note that the CrawlSpider docs advise against overriding parse, since the class uses it internally to implement the rule logic):
import scrapy
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from indicators.items import EducationIndicators


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse", follow=True),)

    def parse(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Or just override the start_requests method with another callback:
import scrapy
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from indicators.items import EducationIndicators


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
And here is code that crawls all the pages at once, by calling the site's data handler directly:
import scrapy
from scrapy.http.request import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import add_or_replace_parameter

from indicators.items import EducationIndicators


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
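The add_or_replace_parameter helper from w3lib does the heavy lifting here: it rewrites one query parameter and leaves the rest of the URL alone, so the hard-coded Page=3 in api_url doesn't matter. A quick illustration:

from w3lib.url import add_or_replace_parameter

url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'
print(add_or_replace_parameter(url, 'Page', 1))
# Same URL with Page=1; the other parameters are preserved
# (w3lib may percent-encode characters such as ':').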
I wrote the following Scrapy spider, but it doesn't continue the crawl after the initial request, although I've yielded more scrapy.Requests for Scrapy to follow.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider


class myspider(Spider):
    name = 'haha'
    allowed_domains = ['https://blog.scrapinghub.com/']
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow=allowed_domains)

    def parse(self, response):
        # To extract all the links on this page
        links_in_page = self.extractor.extract_links(response)
        for link in links_in_page:
            yield scrapy.Request(link.url, callback=self.parse)
allowed_domains needs to be a list of domains, not a list of URLs; with a URL in there, Scrapy's offsite middleware filters out every request the spider yields.
So it should be:
allowed_domains = ['blog.scrapinghub.com']
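With that one-line fix the spider keeps crawling. A minimal corrected version of the spider above (note that allow= is also switched to allow_domains=, since allow expects regular expressions rather than domains):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider


class myspider(Spider):
    name = 'haha'
    allowed_domains = ['blog.scrapinghub.com']  # domains only, no scheme or path
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow_domains=allowed_domains)

    def parse(self, response):
        # Extract all the links on this page and crawl them recursively
        for link in self.extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)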
I am trying to create a simple crawl spider, but response.url seems to be broken. The code I am currently running is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from teatrorecur.items import TeatrorecurItem


class Teatrorecurspider(CrawlSpider):
    name = "teatrorecurspider"
    allowed_domains = ["cartelera.com.uy"]
    start_urls = (
        'http://www.cartelera.com.uy/apeliculafunciones.aspx?,,PELICULAS,OBRA,0,26',
    )

    rules = (
        Rule(LinkExtractor(allow=('CINE&OBRA&-1&29',)), callback='parse_item', follow=False),
        # Rule(LinkExtractor(restrict_xpaths='//a[@href="CINE%2COBRA%2C-1%2C29"]'), follow=False, callback='parse_item'),
        # Rule(LinkExtractor(allow=('CINE&OBRA&-1&29$',)), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        item = TeatrorecurItem()
        item['url'] = response.url
        yield item
A sample URL I'm getting from this code is
<200 http://www.cartelera.com.uy/apeliculafunciones.aspx?-1=&12415=&29=&CINE=&OBRA=>
but the corresponding element in the page has the following href value
<a href="http://www.cartelera.com.uy/apeliculafunciones.aspx?12415&&CINE&OBRA&-1&29">
As you can see, the string following the .aspx? is messed up, and I have no clue what is wrong.
LinkExtractor has an option named canonicalize that defaults to True. Set it to False like so:
rules = (
    Rule(LinkExtractor(allow=('CINE&OBRA&-1&29',), canonicalize=False), callback='parse_item', follow=False),
)
This prevents LinkExtractor from applying the transformations described in the definition of canonicalize_url to the extracted URLs.
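To see exactly what that canonicalization does, you can call w3lib's canonicalize_url (the function LinkExtractor uses under the hood) on the href from the page:

from w3lib.url import canonicalize_url

url = 'http://www.cartelera.com.uy/apeliculafunciones.aspx?12415&&CINE&OBRA&-1&29'
print(canonicalize_url(url))
# The bare query tokens are treated as blank-valued parameters and sorted,
# giving ...apeliculafunciones.aspx?-1=&12415=&29=&CINE=&OBRA=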
Here is my spider:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from vrisko.items import VriskoItem


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']

    rules = (
        Rule(LinkExtractor(allow=(r'\?page=\d',)), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = response.xpath("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = response.xpath("//div[@class='results_address_class']/text()").extract()
        print(' '.join(vriskoit['eponimia']).join(vriskoit['address']))
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls every page except the first one!
Any ideas why this happens? Thank you in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method, so you can change your rule like this:
rules = (
    Rule(LinkExtractor(allow=(r'\?page=\d',)), callback='parse'),
)
and rename the method from parse_vrisko to parse.
Or you can remove start_urls and start your spider with a start_requests method whose callback is parse_vrisko.
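One caveat on that second option: on a CrawlSpider, pointing the start requests at a custom callback bypasses the rule machinery, so the ?page links would no longer be followed. The idiomatic way to get the first page parsed while keeping the rules alive is CrawlSpider's parse_start_url hook; this is a different technique from the one in the answer above, sketched here under that assumption:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']

    rules = (
        Rule(LinkExtractor(allow=(r'\?page=\d',)), callback='parse_vrisko'),
    )

    def parse_start_url(self, response):
        # CrawlSpider calls this for the start_urls responses, so the
        # first page reaches parse_vrisko while the rules keep running.
        return self.parse_vrisko(response)

    def parse_vrisko(self, response):
        ...  # same parsing code as in the question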