I am having issues fetching the start_urls for Scrapy to run from a table in MySQL.
MySQL has a database called "scrapy", a table called "urls" with a single column called "url" which has on each row a URL to scrape.
This is the code I have currently, but I feel like I am missing a concept somewhere:
# -*- coding: utf-8 -*-
import scrapy
import datetime
import urlparse
import socket
import MySQLdb
from scrapy.loader import ItemLoader
from example.items import exampleitem
class instantdbSpider(scrapy.Spider):
name = 'instantdb'
allowed_domains = ['example.com']
def start_requests(self):
conn = MySQLdb.connect(
user='root',
passwd='Password!',
db='scrapy',
host='localhost',
charset="utf8",
use_unicode=True,
cursorclass=MySQLdb.cursors.DictCursor
)
cursor = conn.cursor()
cursor.execute('SELECT * FROM urls')
rows = cursor.fetchall()
for row in rows:
url = row["url"]
yield Request(url=url)
def parse(self, response):
l = ItemLoader(item=exampleitem(), response=response)
#Scrape Fields
l.add_xpath('title', '//html/head/title/text()')
l.add_xpath('sku', '//*[#id="js-zoom-image-container"]/div[5]/h2/strong/text()')
l.add_xpath('price', '//*[#id="main-content"]/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/p/text()[1]')
l.add_xpath('product_title', '//html/body/div[1]/span[4]/text()')
l.add_xpath('image_url', '//*[#id="main-content"]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/a/img/#src')
l.add_xpath('description', '//*[#id="main-content"]/div/div[1]/div[1]/div[2]/div[7]')
# Administration Fields
l.add_value('url', response.url)
l.add_value('project', self.settings.get('BOT_NAME'))
l.add_value('spider', self.name)
l.add_value('server', socket.gethostname())
l.add_value('date', datetime.datetime.now())
return l.load_item()
Any help would be greatly appreciated as I seem to be going around in circles. Thanks.
You have an indentation issue:
for row in rows:
url = row["url"]
yield Request(url=url)
Related
# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.response import open_in_browser
from pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
#from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
class ExampleSpider(scrapy.Spider):
name = 'forever'
allowed_domains = ['example.com']
kohlind = max_kohls = 0
total_products = 0
colected = 0
items = []
#AWS_ACCESS_KEY_ID = 'id'
#AWS_SECRET_ACCESS_KEY = 'pass'
start_urls=['https://www.example.com/']
custom_settings = {'FEED_URI' : f's3://example-products/fulltest.csv',
'FEED_EXPORT_FIELDS': ['ITEM_ID','URL','SELLER','PRICE','SALE_PRICE','MAIN_IMAGE','OTHER_IMAGE','SKU','PRODUCT_NAME']
}
def __init__(self):
SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)
#spider code
def _close(self):
print(f"\n\nClosing Spider with {len(self.items)} New Items")
for i in self.items:
if "URL" in i.keys():
yield item
print("Done")
Program is not connecting to _close function, no error is found and the yield items in spider code are uploaded normally (except the _close nothing happens)
I tried removing the s3 in the settings, It worked fine (i.e entered the _close function)
How can I fix?
Try this code below, and it should work
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
class ExampleSpider(scrapy.Spider):
name = 'forever'
def __init__(self):
dispatcher.connect(self.spider_closed, signals.spider_closed)
def spider_closed(self, spider):
print(f"\n\nClosing Spider with {len(self.items)} New Items")
Here's my current code:
#scrap all the cafe links from example.com
import scrapy, re
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Selector
class DengaSpider(scrapy.Spider):
name = 'cafes'
allowed_domains = ['example.com']
start_urls = [
'http://example.com/archives/8136.html',
]
cafeOnlyLink = []
def parse(self, response):
cafelink = response.xpath('//li/a[contains(#href, "archives")]/#href').extract()
twoHourRegex = re.compile(r'^http://example\.com/archives/\d+.html$')
cafeOnlyLink = [ s for s in cafelink if twoHourRegex.match(s) ]
So how should I continue to parse content from each url containing in the [cafeOnlyLink] list? and I want to save all the result from each page in a csv file.
You can use something like this:
for url in cafeOnlyLink:
yield scrapy.Request(url=url, callback=self.parse_save_to_csv)
def parse_save_to_csv(self, response):
# The content is in response.body, so you have to select what information
# you want to sent to the csv file.
Bear with me. I'm writing every detail because so many parts of the toolchain do not handle Unicode gracefully and it's not clear what is failing.
PRELUDE
We first set up and use a recent Scrapy.
source ~/.scrapy_1.1.2/bin/activate
Since the terminal's default is ascii, not unicode, we set:
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
Also since by default Python uses ascii, we modify the encoding:
export PYTHONIOENCODING="utf_8"
Now we're ready to start a Scrapy project.
scrapy startproject myproject
cd myproject
scrapy genspider dorf PLACEHOLDER
We're told we now have a spider.
Created spider 'dorf' using template 'basic' in module:
myproject.spiders.dorf
We modify myproject/items.py to be:
# -*- coding: utf-8 -*-
import scrapy
class MyprojectItem(scrapy.Item):
title = scrapy.Field()
ATTEMPT 1
Now we write the spider, relying on urllib.unquote
# -*- coding: utf-8 -*-
import scrapy
import urllib
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
name = "dorf"
allowed_domains = [u'http://en.sistercity.info/']
start_urls = (
u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
)
def parse(self, response):
item = MyprojectItem()
item['title'] = urllib.unquote(
response.xpath('//title').extract_first().encode('ascii')
).decode('utf8')
return item
And finally we use a custom item exporter (from all the way back in Oct 2011)
# -*- coding: utf-8 -*-
import json
from scrapy.exporters import BaseItemExporter
class UnicodeJsonLinesItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs)
self.file = file
self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
self.file.write(self.encoder.encode(itemdict) + '\n')
and add
FEED_EXPORTERS = {
'json': 'myproject.exporters.UnicodeJsonLinesItemExporter',
}
to myproject/settings.py.
Now we run
~/myproject> scrapy crawl dorf -o dorf.json -t json
we get
UnicodeEncodeError: 'ascii' codec can't encode character u'\xfc' in position 25: ordinal not in range(128)
ATTEMPT 2
Another solution (the candidate solution for Scrapy 1.2?) is to use the spider
# -*- coding: utf-8 -*-
import scrapy
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
name = "dorf"
allowed_domains = [u'http://en.sistercity.info/']
start_urls = (
u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
)
def parse(self, response):
item = MyprojectItem()
item['title'] = response.xpath('//title')[0].extract()
return item
and the custom item exporter
# -*- coding: utf-8 -*-
from scrapy.exporters import JsonItemExporter
class Utf8JsonItemExporter(JsonItemExporter):
def __init__(self, file, **kwargs):
super(Utf8JsonItemExporter, self).__init__(
file, ensure_ascii=False, **kwargs)
with
FEED_EXPORTERS = {
'json': 'myproject.exporters.Utf8JsonItemExporter',
}
in myproject/settings.py.
We get the following JSON file.
[
{"title": "<title>Sister cities of D\u00fcsseldorf \u2014 sistercity.info</title>"}
]
The Unicode is not UTF-8 encoded. Although this is a trivial problem for a couple of characters, it becomes a serious issue if the entire output is in a foreign language.
How can I get an output in UTF-8 unicode?
In Scrapy 1.2+ there is a FEED_EXPORT_ENCODING option. When FEED_EXPORT_ENCODING = "utf-8" escaping of non-ascii symbols in JSON output is turned off.
please try this on your Attempt 1 and let me know if it works (I've test it without setting all those env. variables)
def to_write(uni_str):
return urllib.unquote(uni_str.encode('utf8')).decode('utf8')
class CitiesSpider(scrapy.Spider):
name = "cities"
allowed_domains = ["sitercity.info"]
start_urls = (
'http://en.sistercity.info/sister-cities/Düsseldorf.html',
)
def parse(self, response):
for i in range(2):
item = SimpleItem()
item['title'] = to_write(response.xpath('//title').extract_first())
item['url'] = to_write(response.url)
yield item
the range(2) is for testing the json exporter, to get a list of dicts you can do this instead:
# -*- coding: utf-8 -*-
from scrapy.contrib.exporter import JsonItemExporter
from scrapy.utils.serialize import ScrapyJSONEncoder
class UnicodeJsonLinesItemExporter(JsonItemExporter):
def __init__(self, file, **kwargs):
self._configure(kwargs, dont_fail=True)
self.file = file
self.encoder = ScrapyJSONEncoder(ensure_ascii=False, **kwargs)
self.first_item = True
I am trying to create a simple crawl spider, but the response.url seem to be broken.
The code i am currently running is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from teatrorecur.items import TeatrorecurItem
class Teatrorecurspider(CrawlSpider):
name = "teatrorecurspider"
allowed_domains = ["cartelera.com.uy"]
start_urls = (
'http://www.cartelera.com.uy/apeliculafunciones.aspx?,,PELICULAS,OBRA,0,26',
)
rules = (
Rule(LinkExtractor(allow=('CINE&OBRA&-1&29', )), callback='parse_item', follow=False),
#Rule(LinkExtractor(restrict_xpaths='//a[#href="CINE%2COBRA%2C-1%2C29"]'), follow=False, callback='parse_item'),
#Rule(LinkExtractor(allow=('CINE&OBRA&-1&29$', )), callback='parse_item', follow=False),
)
def parse_item(self, response):
item = TeatrorecurItem()
item['url']=response.url
yield item
a sample url i'm getting from this code is
<200 http://www.cartelera.com.uy/apeliculafunciones.aspx?-1=&12415=&29=&CINE=&OBRA=>
but the corresponding element in the page has the following href value
<a href="http://www.cartelera.com.uy/apeliculafunciones.aspx?12415&&CINE&OBRA&-1&29">
as you can see, the string following the .aspx? is messed up, i have no clue what is wrong.
LinkExtractor has a option named canonicalize that defaults to True.
Set it to False like so:
rules = (
Rule(LinkExtractor(allow=('CINE&OBRA&-1&29',), canonicalize=False), callback='parse_item', follow=False),
)
This will prevent LinkExtractor from performing changes to the url described at the def of canonicalize_url.
here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
class vriskoSpider(CrawlSpider):
name = 'vrisko'
allowed_domains = ['vrisko.gr']
start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
rules = (
Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
)
def parse_vrisko(self, response):
hxs = HtmlXPathSelector(response)
vriskoit = VriskoItem()
vriskoit['eponimia'] = hxs.select("//a[#itemprop='name']/text()").extract()
vriskoit['address'] = hxs.select("//div[#class='results_address_class']/text()").extract()
print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
return vriskoit
The pages i try to crawl have the format http://www.blabla.com/blabla/bla?page=x
where x = any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why does this happen ?
Thank you in advance!
if you look into scrapy doc , start_urls response goes to **
parse
** method
so you can change your rule like this
rules = (
Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and method name from def parse_vrisko(self, response): to def parse(self, response):
or you can remove start_urls and start your spider with def start_requests(self): with callback to parse_vrisko