# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.response import open_in_browser
from pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
#from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
class ExampleSpider(scrapy.Spider):
    name = 'forever'
    allowed_domains = ['example.com']
    kohlind = max_kohls = 0
    total_products = 0
    colected = 0
    items = []
    #AWS_ACCESS_KEY_ID = 'id'
    #AWS_SECRET_ACCESS_KEY = 'pass'
    start_urls = ['https://www.example.com/']
    custom_settings = {
        'FEED_URI': f's3://example-products/fulltest.csv',
        'FEED_EXPORT_FIELDS': ['ITEM_ID', 'URL', 'SELLER', 'PRICE', 'SALE_PRICE',
                               'MAIN_IMAGE', 'OTHER_IMAGE', 'SKU', 'PRODUCT_NAME'],
    }

    def __init__(self):
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)

    #spider code
    def _close(self):
        print(f"\n\nClosing Spider with {len(self.items)} New Items")
        for i in self.items:
            if "URL" in i.keys():
                yield i
        print("Done")
The program never reaches the _close function: no error is raised, and the items yielded in the spider code are uploaded normally; only _close never runs.
When I removed the s3 URI from the settings, it worked fine (i.e. the spider entered the _close function).
How can I fix this?
Try the code below; it should work:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class ExampleSpider(scrapy.Spider):
    name = 'forever'

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(f"\n\nClosing Spider with {len(self.items)} New Items")
Related
I am currently trying to crawl data from three websites (three different URLs), so I use a text file to load the URLs into start_urls.
There are three URLs in my file at the moment, but the script only keeps the data of the last one and overwrites the data of the two URLs before it.
This is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from time import sleep
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
import csv


class AlltipsSpider(Spider):
    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [l.strip() for l in f.readlines()]
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            yield Request(self.driver.current_url, callback=self.crawltips)

    def crawltips(self, response):
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {
                'Username': username,
                'Publish date': publish_date,
            }
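One thing worth checking with this setup (a hedged suggestion, not a confirmed fix): every request is yielded for self.driver.current_url, so if two of the URLs end up at the same current_url Scrapy's duplicate filter silently drops the later request, and crawltips always reads self.driver.page_source, which by the time the callback runs holds whatever page the driver loaded last. A sketch that sidesteps both, replacing the two methods above (cb_kwargs needs Scrapy 1.7+):
    def start_requests(self):
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [l.strip() for l in f.readlines()]
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            # Snapshot the rendered HTML now, before the driver moves on to the next URL
            yield Request(self.driver.current_url,
                          callback=self.crawltips,
                          dont_filter=True,
                          cb_kwargs={'page_source': self.driver.page_source})

    def crawltips(self, response, page_source):
        sel = Selector(text=page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            yield {
                'Username': post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract(),
                'Publish date': post.xpath('.//*[@class="bet-age text-muted"]/text()').extract(),
            }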
I am having issues fetching the start_urls for Scrapy from a table in MySQL.
MySQL has a database called "scrapy", a table called "urls" with a single column called "url" which has on each row a URL to scrape.
This is the code I have currently, but I feel like I am missing a concept somewhere:
# -*- coding: utf-8 -*-
import scrapy
import datetime
import urlparse
import socket
import MySQLdb
from scrapy.http import Request
from scrapy.loader import ItemLoader
from example.items import exampleitem


class instantdbSpider(scrapy.Spider):
    name = 'instantdb'
    allowed_domains = ['example.com']

    def start_requests(self):
        conn = MySQLdb.connect(
            user='root',
            passwd='Password!',
            db='scrapy',
            host='localhost',
            charset="utf8",
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM urls')
        rows = cursor.fetchall()
        for row in rows:
            url = row["url"]
        yield Request(url=url)

    def parse(self, response):
        l = ItemLoader(item=exampleitem(), response=response)
        # Scrape Fields
        l.add_xpath('title', '//html/head/title/text()')
        l.add_xpath('sku', '//*[@id="js-zoom-image-container"]/div[5]/h2/strong/text()')
        l.add_xpath('price', '//*[@id="main-content"]/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/p/text()[1]')
        l.add_xpath('product_title', '//html/body/div[1]/span[4]/text()')
        l.add_xpath('image_url', '//*[@id="main-content"]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/a/img/@src')
        l.add_xpath('description', '//*[@id="main-content"]/div/div[1]/div[1]/div[2]/div[7]')
        # Administration Fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()
Any help would be greatly appreciated as I seem to be going around in circles. Thanks.
You have an indentation issue:
for row in rows:
    url = row["url"]
    yield Request(url=url)
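In context, the end of start_requests then yields one Request per row instead of a single Request after the loop. A sketch of the corrected method (connection details as in the question; closing the cursor and connection once the rows are fetched is an extra tidy-up, not something the original code did):
    def start_requests(self):
        conn = MySQLdb.connect(
            user='root',
            passwd='Password!',
            db='scrapy',
            host='localhost',
            charset="utf8",
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM urls')
        rows = cursor.fetchall()
        cursor.close()
        conn.close()
        for row in rows:
            url = row["url"]
            yield Request(url=url)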
Bear with me. I'm writing every detail because so many parts of the toolchain do not handle Unicode gracefully and it's not clear what is failing.
PRELUDE
We first set up and use a recent Scrapy.
source ~/.scrapy_1.1.2/bin/activate
Since the terminal's default is ascii, not unicode, we set:
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
Also since by default Python uses ascii, we modify the encoding:
export PYTHONIOENCODING="utf_8"
Now we're ready to start a Scrapy project.
scrapy startproject myproject
cd myproject
scrapy genspider dorf PLACEHOLDER
We're told we now have a spider.
Created spider 'dorf' using template 'basic' in module:
myproject.spiders.dorf
We modify myproject/items.py to be:
# -*- coding: utf-8 -*-
import scrapy
class MyprojectItem(scrapy.Item):
    title = scrapy.Field()
ATTEMPT 1
Now we write the spider, relying on urllib.unquote
# -*- coding: utf-8 -*-
import scrapy
import urllib
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
    name = "dorf"
    allowed_domains = [u'http://en.sistercity.info/']
    start_urls = (
        u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
    )

    def parse(self, response):
        item = MyprojectItem()
        item['title'] = urllib.unquote(
            response.xpath('//title').extract_first().encode('ascii')
        ).decode('utf8')
        return item
And finally we use a custom item exporter (from all the way back in Oct 2011)
# -*- coding: utf-8 -*-
import json
from scrapy.exporters import BaseItemExporter
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + '\n')
and add
FEED_EXPORTERS = {
    'json': 'myproject.exporters.UnicodeJsonLinesItemExporter',
}
to myproject/settings.py.
Now we run
~/myproject> scrapy crawl dorf -o dorf.json -t json
we get
UnicodeEncodeError: 'ascii' codec can't encode character u'\xfc' in position 25: ordinal not in range(128)
ATTEMPT 2
Another solution (the candidate solution for Scrapy 1.2?) is to use the spider
# -*- coding: utf-8 -*-
import scrapy
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
    name = "dorf"
    allowed_domains = [u'http://en.sistercity.info/']
    start_urls = (
        u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
    )

    def parse(self, response):
        item = MyprojectItem()
        item['title'] = response.xpath('//title')[0].extract()
        return item
and the custom item exporter
# -*- coding: utf-8 -*-
from scrapy.exporters import JsonItemExporter
class Utf8JsonItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        super(Utf8JsonItemExporter, self).__init__(
            file, ensure_ascii=False, **kwargs)
with
FEED_EXPORTERS = {
    'json': 'myproject.exporters.Utf8JsonItemExporter',
}
in myproject/settings.py.
We get the following JSON file.
[
{"title": "<title>Sister cities of D\u00fcsseldorf \u2014 sistercity.info</title>"}
]
The output is not UTF-8 encoded: the non-ASCII characters are written as \u escape sequences instead. Although this is a trivial problem for a couple of characters, it becomes a serious issue if the entire output is in a foreign language.
How can I get the output encoded as UTF-8?
In Scrapy 1.2+ there is a FEED_EXPORT_ENCODING option. With FEED_EXPORT_ENCODING = "utf-8", escaping of non-ASCII symbols in the JSON output is turned off.
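For example, in myproject/settings.py:
FEED_EXPORT_ENCODING = 'utf-8'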
Please try this on your Attempt 1 and let me know if it works (I've tested it without setting all those environment variables):
import urllib
import scrapy


def to_write(uni_str):
    return urllib.unquote(uni_str.encode('utf8')).decode('utf8')


class CitiesSpider(scrapy.Spider):
    name = "cities"
    allowed_domains = ["sitercity.info"]
    start_urls = (
        'http://en.sistercity.info/sister-cities/Düsseldorf.html',
    )

    def parse(self, response):
        for i in range(2):
            item = SimpleItem()  # SimpleItem: an Item class with 'title' and 'url' fields
            item['title'] = to_write(response.xpath('//title').extract_first())
            item['url'] = to_write(response.url)
            yield item
The range(2) is just for testing the JSON exporter; to get a list of dicts you can use this exporter instead:
# -*- coding: utf-8 -*-
from scrapy.contrib.exporter import JsonItemExporter
from scrapy.utils.serialize import ScrapyJSONEncoder
class UnicodeJsonLinesItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(ensure_ascii=False, **kwargs)
        self.first_item = True
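As with the earlier exporters, this one only takes effect once it is registered in settings.py (the module path assumes the class lives in myproject/exporters.py, as above):
FEED_EXPORTERS = {
    'json': 'myproject.exporters.UnicodeJsonLinesItemExporter',
}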
I want to schedule my spider to run again one hour after crawling finishes. In my code, the spider_closed method is called after the crawl ends. Now, how do I run the spider again from this method? Or are there any available settings to schedule the Scrapy spider?
Here is my basic spider code:
import scrapy
import codecs
from a2i.items import A2iItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher
class A2iSpider(scrapy.Spider):
    name = "notice"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
    allowed_domains = ["prothom-alo.com"]

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def parse(self, response):
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            print "*"*70
            print url
            print "\n\n"
            yield scrapy.Request(url, callback=self.parse_page, meta={'depth': 2, 'url': url})

    def parse_page(self, response):
        filename = "response.txt"
        depth = response.meta['depth']
        with open(filename, 'a') as f:
            f.write(str(depth))
            f.write("\n")
            f.write(response.meta['url'])
            f.write("\n")
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_page, meta={'depth': depth + 1, 'url': url})

    def spider_closed(self, spider):
        print "$"*2000
You can use cron.
crontab -e lets you create the schedule and run scripts as root, or
crontab -u [user] -e runs them as a specific user.
At the bottom of the file you can add:
0 * * * * cd /path/to/your/scrapy && scrapy crawl [yourScrapy] >> /path/to/log/scrapy_log.log
0 * * * * makes the command run hourly; you can find more details about the schedule syntax online.
You can run your spider with the JOBDIR setting; it will persist the requests loaded in the scheduler between runs:
scrapy crawl somespider -s JOBDIR=crawls/somespider-1
https://doc.scrapy.org/en/latest/topics/jobs.html
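If you would rather have the one-hour delay handled by a wrapper script instead of cron, a minimal sketch is to re-invoke the crawl in a loop (the spider name "notice" comes from the question; the fixed 3600-second pause is an assumption):
# run_forever.py - restart the crawl an hour after each run finishes
import subprocess
import time

while True:
    subprocess.call(["scrapy", "crawl", "notice"])
    time.sleep(3600)  # wait one hour before the next run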
I am a newbie to Scrapy. I am trying to write a spider to download images. To use the image pipeline, is installing PIL sufficient? My PIL is located in
/usr/lib/python2.7/dist-packages/PIL
How do I include it in my Scrapy project?
settings file:
BOT_NAME = 'paulsmith'
BOT_VERSION = '1.0'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGE_STORE = '/home/jay/Scrapy/paulsmith/images'
SPIDER_MODULES = ['paulsmith.spiders']
NEWSPIDER_MODULE = 'paulsmith.spiders'
DEFAULT_ITEM_CLASS = 'paulsmith.items.PaulsmithItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
Items file:
from scrapy.item import Item, Field
class PaulsmithItem(Item):
    image_urls = Field()
    image = Field()
Spider code
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from paulsmith.items import PaulsmithItem


class PaulSmithSpider(BaseSpider):
    name = "Paul"
    allowed_domains = ["http://www.paulsmith.co.uk/uk-en/shop/mens"]
    start_urls = ["http://www.paulsmith.co.uk/uk-en/shop/mens/jeans"]

    def parse(self, response):
        item = PaulsmithItem()
        #open('paulsmith.html','wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div[@class="category-products"]')
        item['image_urls'] = hxs.select("//div[@class='category-products']//a/img/@src").extract()
        #for site in sites:
        #    print site.extract()
        #    image = site.select('//a/img/@src').extract()
        return item


SPIDER = PaulSmithSpider()
You may not have set IMAGES_STORE = '/path/to/valid/dir' (note that the settings file above defines IMAGE_STORE, not IMAGES_STORE).
Moreover, try using a custom images pipeline like this:
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
You can check from the get_media_requests method whether the image_urls are actually being requested.
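If you use the custom pipeline, it also has to replace the stock one in the settings (the module path paulsmith.pipelines is an assumption about where MyImagesPipeline is defined; IMAGES_STORE reuses the directory from the question):
ITEM_PIPELINES = ['paulsmith.pipelines.MyImagesPipeline']
IMAGES_STORE = '/home/jay/Scrapy/paulsmith/images'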
reference: http://doc.scrapy.org/en/latest/topics/images.html