Scrapy crawl spider returns broken urls - scrapy

I am trying to create a simple crawl spider, but the response.url seem to be broken.
The code i am currently running is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from teatrorecur.items import TeatrorecurItem
class Teatrorecurspider(CrawlSpider):
name = "teatrorecurspider"
allowed_domains = ["cartelera.com.uy"]
start_urls = (
'http://www.cartelera.com.uy/apeliculafunciones.aspx?,,PELICULAS,OBRA,0,26',
)
rules = (
Rule(LinkExtractor(allow=('CINE&OBRA&-1&29', )), callback='parse_item', follow=False),
#Rule(LinkExtractor(restrict_xpaths='//a[#href="CINE%2COBRA%2C-1%2C29"]'), follow=False, callback='parse_item'),
#Rule(LinkExtractor(allow=('CINE&OBRA&-1&29$', )), callback='parse_item', follow=False),
)
def parse_item(self, response):
item = TeatrorecurItem()
item['url']=response.url
yield item
a sample url i'm getting from this code is
<200 http://www.cartelera.com.uy/apeliculafunciones.aspx?-1=&12415=&29=&CINE=&OBRA=>
but the corresponding element in the page has the following href value
<a href="http://www.cartelera.com.uy/apeliculafunciones.aspx?12415&&CINE&OBRA&-1&29">
as you can see, the string following the .aspx? is messed up, i have no clue what is wrong.

LinkExtractor has a option named canonicalize that defaults to True.
Set it to False like so:
rules = (
Rule(LinkExtractor(allow=('CINE&OBRA&-1&29',), canonicalize=False), callback='parse_item', follow=False),
)
This will prevent LinkExtractor from performing changes to the url described at the def of canonicalize_url.

Related

Scrapy CrawlSpider: Getting data before extracting link

In CrawlSpider, how can I scrape the marked field "4 days ago" in the image before extracting each link?
The below-mentioned CrawlSpider is working fine. But in 'parse_item' I want to add a new field named 'Add posted' where I want to get the field marked on the image.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class PropertySpider(CrawlSpider):
name = 'property'
allowed_domains = ['www.openrent.co.uk']
start_urls = [
'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip='+ str(x) for x in range(0, 5, 20)
]
rules = (
Rule(LinkExtractor(restrict_xpaths="//div[#id='property-data']/a"), callback='parse_item', follow=True),
)
def parse_item(self, response):
yield {
'Title': response.xpath("//h1[#class='property-title']/text()").get(),
'Price': response.xpath("//h3[#class='perMonthPrice price-title']/text()").get(),
'Links': response.url,
'Add posted': ?
}
When using the Rule object of the scrapy crawl spider, the extracted link's text is saved in a meta field of the request named link_text. You can obtain this value in the parse_item method and extract the time information using regex. You can read more about it from the docs. See below example.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
class PropertySpider(CrawlSpider):
name = 'property'
allowed_domains = ['www.openrent.co.uk']
start_urls = [
'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip='+ str(x) for x in range(0, 5, 20)
]
rules = (
Rule(LinkExtractor(restrict_xpaths="//div[#id='property-data']/a"), callback='parse_item', follow=True),
)
def parse_item(self, response):
link_text = response.request.meta.get("link_text")
m = re.search(r"(Last Updated.*ago)", link_text)
if m:
posted = m.group(1).replace("\xa0", " ")
yield {
'Title': response.xpath("//h1[#class='property-title']/text()").get(),
'Price': response.xpath("//h3[#class='perMonthPrice price-title']/text()").get(),
'Links': response.url,
"Add posted": posted
}
To show in a loop, you can use the following xpath to receive that data point:
x = response.xpath('//div[#class="timeStamp"]')
for i in x:
yield {'result': i.xpath("./i/following-sibling::text()").get().strip() }

scrapy list of links from parse result

Here's my current code:
#scrap all the cafe links from example.com
import scrapy, re
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Selector
class DengaSpider(scrapy.Spider):
name = 'cafes'
allowed_domains = ['example.com']
start_urls = [
'http://example.com/archives/8136.html',
]
cafeOnlyLink = []
def parse(self, response):
cafelink = response.xpath('//li/a[contains(#href, "archives")]/#href').extract()
twoHourRegex = re.compile(r'^http://example\.com/archives/\d+.html$')
cafeOnlyLink = [ s for s in cafelink if twoHourRegex.match(s) ]
So how should I continue to parse content from each url containing in the [cafeOnlyLink] list? and I want to save all the result from each page in a csv file.
You can use something like this:
for url in cafeOnlyLink:
yield scrapy.Request(url=url, callback=self.parse_save_to_csv)
def parse_save_to_csv(self, response):
# The content is in response.body, so you have to select what information
# you want to sent to the csv file.

scrapy yield Request not working

I wrote the following scrapy spider but it's not continuing the crawling process after the initial request, although I've yielded more scrapy.Requests for scrapy to follow.
import regex as re
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
class myspider(Spider):
name = 'haha'
allowed_domains = ['https://blog.scrapinghub.com/']
start_urls = ['https://blog.scrapinghub.com/']
extractor = LinkExtractor(allow=allowed_domains)
def parse(self, response):
# To extract all the links on this page
links_in_page = self.extractor.extract_links(response)
for link in links_in_page:
yield scrapy.Request(link.url, callback=self.parse)
allowed_domains needs to be a list of domains, not a list of URLs.
So it should be:
allowed_domains = ['blog.scrapinghub.com']

How can I get an output in UTF-8 encoded unicode from Scrapy?

Bear with me. I'm writing every detail because so many parts of the toolchain do not handle Unicode gracefully and it's not clear what is failing.
PRELUDE
We first set up and use a recent Scrapy.
source ~/.scrapy_1.1.2/bin/activate
Since the terminal's default is ascii, not unicode, we set:
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
Also since by default Python uses ascii, we modify the encoding:
export PYTHONIOENCODING="utf_8"
Now we're ready to start a Scrapy project.
scrapy startproject myproject
cd myproject
scrapy genspider dorf PLACEHOLDER
We're told we now have a spider.
Created spider 'dorf' using template 'basic' in module:
myproject.spiders.dorf
We modify myproject/items.py to be:
# -*- coding: utf-8 -*-
import scrapy
class MyprojectItem(scrapy.Item):
title = scrapy.Field()
ATTEMPT 1
Now we write the spider, relying on urllib.unquote
# -*- coding: utf-8 -*-
import scrapy
import urllib
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
name = "dorf"
allowed_domains = [u'http://en.sistercity.info/']
start_urls = (
u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
)
def parse(self, response):
item = MyprojectItem()
item['title'] = urllib.unquote(
response.xpath('//title').extract_first().encode('ascii')
).decode('utf8')
return item
And finally we use a custom item exporter (from all the way back in Oct 2011)
# -*- coding: utf-8 -*-
import json
from scrapy.exporters import BaseItemExporter
class UnicodeJsonLinesItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs)
self.file = file
self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
self.file.write(self.encoder.encode(itemdict) + '\n')
and add
FEED_EXPORTERS = {
'json': 'myproject.exporters.UnicodeJsonLinesItemExporter',
}
to myproject/settings.py.
Now we run
~/myproject> scrapy crawl dorf -o dorf.json -t json
we get
UnicodeEncodeError: 'ascii' codec can't encode character u'\xfc' in position 25: ordinal not in range(128)
ATTEMPT 2
Another solution (the candidate solution for Scrapy 1.2?) is to use the spider
# -*- coding: utf-8 -*-
import scrapy
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
name = "dorf"
allowed_domains = [u'http://en.sistercity.info/']
start_urls = (
u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
)
def parse(self, response):
item = MyprojectItem()
item['title'] = response.xpath('//title')[0].extract()
return item
and the custom item exporter
# -*- coding: utf-8 -*-
from scrapy.exporters import JsonItemExporter
class Utf8JsonItemExporter(JsonItemExporter):
def __init__(self, file, **kwargs):
super(Utf8JsonItemExporter, self).__init__(
file, ensure_ascii=False, **kwargs)
with
FEED_EXPORTERS = {
'json': 'myproject.exporters.Utf8JsonItemExporter',
}
in myproject/settings.py.
We get the following JSON file.
[
{"title": "<title>Sister cities of D\u00fcsseldorf \u2014 sistercity.info</title>"}
]
The Unicode is not UTF-8 encoded. Although this is a trivial problem for a couple of characters, it becomes a serious issue if the entire output is in a foreign language.
How can I get an output in UTF-8 unicode?
In Scrapy 1.2+ there is a FEED_EXPORT_ENCODING option. When FEED_EXPORT_ENCODING = "utf-8" escaping of non-ascii symbols in JSON output is turned off.
please try this on your Attempt 1 and let me know if it works (I've test it without setting all those env. variables)
def to_write(uni_str):
return urllib.unquote(uni_str.encode('utf8')).decode('utf8')
class CitiesSpider(scrapy.Spider):
name = "cities"
allowed_domains = ["sitercity.info"]
start_urls = (
'http://en.sistercity.info/sister-cities/Düsseldorf.html',
)
def parse(self, response):
for i in range(2):
item = SimpleItem()
item['title'] = to_write(response.xpath('//title').extract_first())
item['url'] = to_write(response.url)
yield item
the range(2) is for testing the json exporter, to get a list of dicts you can do this instead:
# -*- coding: utf-8 -*-
from scrapy.contrib.exporter import JsonItemExporter
from scrapy.utils.serialize import ScrapyJSONEncoder
class UnicodeJsonLinesItemExporter(JsonItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs, dont_fail=True)
self.file = file
self.encoder = ScrapyJSONEncoder(ensure_ascii=False, **kwargs)
self.first_item = True

Scrapy won't get results from first page

here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
class vriskoSpider(CrawlSpider):
name = 'vrisko'
allowed_domains = ['vrisko.gr']
start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
rules = (
Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
)
def parse_vrisko(self, response):
hxs = HtmlXPathSelector(response)
vriskoit = VriskoItem()
vriskoit['eponimia'] = hxs.select("//a[#itemprop='name']/text()").extract()
vriskoit['address'] = hxs.select("//div[#class='results_address_class']/text()").extract()
print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
return vriskoit
The pages i try to crawl have the format http://www.blabla.com/blabla/bla?page=x
where x = any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why does this happen ?
Thank you in advance!
if you look into scrapy doc , start_urls response goes to **
parse
** method
so you can change your rule like this
rules = (
Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and method name from def parse_vrisko(self, response): to def parse(self, response):
or you can remove start_urls and start your spider with def start_requests(self): with callback to parse_vrisko