I'm trying to create a spider that runs all the time; as soon as it reaches its idle state, it should fetch the next URL to parse from the database.
Unfortunately, I got stuck at the very beginning:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from scrapy import Spider
import logging


class SignalspiderSpider(Spider):
    name = 'signalspider'
    allowed_domains = ['domain.de']
    yet = False

    def start_requests(self):
        logging.log(logging.INFO, "______ Loading requests")
        yield scrapy.Request('https://www.domain.de/product1.html')

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        logging.log(logging.INFO, "______ From Crawler")
        spider = super(SignalspiderSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.idle, signal=scrapy.signals.spider_idle)
        return spider

    def parse(self, response):
        self.logger.info("______ Finished extracting structured data from HTML")

    def idle(self):
        logging.log(logging.INFO, "_______ Idle state")
        if not self.yet:
            self.crawler.engine.crawl(self.create_request(), self)
            self.yet = True

    def create_request(self):
        logging.log(logging.INFO, "_____________ Create requests")
        yield scrapy.Request('https://www.domain.de/product2.html?dvar_82_color=blau&cgid=')
and the error that I get:
2019-03-27 21:41:38 [root] INFO: _______ Idle state
2019-03-27 21:41:38 [root] INFO: _____________ Create requests
2019-03-27 21:41:38 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method RefererMiddleware.request_scheduled of <scrapy.spidermiddlewares.referer.RefererMiddleware object at 0x7f93bcc13978>>
Traceback (most recent call last):
  File "/home/spidy/Documents/spo/lib/python3.5/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
    *arguments, **named)
  File "/home/spidy/Documents/spo/lib/python3.5/site-packages/pydispatch/robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "/home/spidy/Documents/spo/lib/python3.5/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
    redirected_urls = request.meta.get('redirect_urls', [])
AttributeError: 'NoneType' object has no attribute 'meta'
What am I doing wrong?
Try with:
def idle(self, spider):
    logging.log(logging.INFO, "_______ Idle state")
    if not self.yet:
        self.yet = True
        # Request here is scrapy.Request (from scrapy import Request)
        self.crawler.engine.crawl(
            Request(url='https://www.domain.de/product2.html?dvar_82_color=blau&cgid=',
                    callback=spider.parse),
            spider)
I'm not sure it is correct to create the request in the spider_idle handler by passing another method which yields the request, the way you do.
See more at Scrapy spider_idle signal - need to add requests with parse item callback
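For the original goal (fetching the next URL to crawl from the database whenever the spider goes idle), a minimal sketch of the idle handler could look like the following. The get_next_url_from_db() helper is hypothetical and stands in for whatever database access you use; raising DontCloseSpider prevents the spider from shutting down after the new request is scheduled:

from scrapy import Request
from scrapy.exceptions import DontCloseSpider

def idle(self, spider):
    # Hypothetical helper: returns the next URL from the database, or None.
    url = self.get_next_url_from_db()
    if url:
        self.crawler.engine.crawl(Request(url, callback=self.parse), spider)
        # Keep the spider running instead of letting it close while idle.
        raise DontCloseSpider()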
I have an issue with the scrapyd API.
I wrote a simple spider; it gets the domain URL as an argument.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, domains=None):
        self.allowed_domains = [domains]
        self.start_urls = ['http://{}/'.format(domains)]

    def parse(self, response):
        # time.sleep(int(self.sleep))
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        yield item
It works perfectly if I run it like
scrapy crawl quotes -a domains=quotes.toscrape.com
But when it comes to running it via scrapyd_api, it goes wrong:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.schedule(project='pd', spider='quotes', domains='http://quotes.toscrape.com/')
I get builtins.TypeError: __init__() got an unexpected keyword argument '_job'
How can I start Scrapy spiders via the scrapyd API with args?
Here is an answer.
According to this answer, I was wrong about the super method.
Now my code looks like this:
class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = []

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = [kwargs.get('domains')]
        self.start_urls.append('http://{}/'.format(kwargs.get('domains')))
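The key point is that scrapyd passes an extra _job keyword argument when it schedules a spider (that is what the TypeError was complaining about), so __init__ has to accept **kwargs and forward them to the base class. With the fix above, scheduling with an argument works again; note that the spider formats the domain into a URL itself, so pass the bare domain rather than a full URL:

from scrapyd_api import ScrapydAPI

scrapyd = ScrapydAPI('http://localhost:6800')
# 'domains' ends up in kwargs and is used to build allowed_domains and start_urls.
scrapyd.schedule(project='pd', spider='quotes', domains='quotes.toscrape.com')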
I would like to repeatedly scrape the same URLs with different delays. After researching the issue it seemed that the appropriate solution was to use something like
nextreq = scrapy.Request(url, dont_filter=True)
d = defer.Deferred()
delay = 1
reactor.callLater(delay, d.callback, nextreq)
yield d
in parse.
However, I have been unable to make this work. I am getting the error message
ERROR: Spider must return Request, BaseItem, dict or None, got 'Deferred'
I am not familiar with Twisted, so I hope I am just missing something obvious.
Is there a better way of achieving my goal that doesn't fight the framework so much?
I finally found an answer in an old PR:
def parse(self, response):
    req = scrapy.Request(...)
    delay = 0
    reactor.callLater(delay, self.crawler.engine.schedule, request=req, spider=self)
However, the spider can exit too early because it is considered idle. Based on the outdated middleware https://github.com/ArturGaspar/scrapy-delayed-requests, this can be remedied with:
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class ImmortalSpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_idle, signal=signals.spider_idle)
        return s

    @classmethod
    def spider_idle(cls, spider):
        raise DontCloseSpider()
The final option, updating the middleware by ArturGaspar, led to:
from weakref import WeakKeyDictionary

from scrapy import signals
from scrapy.exceptions import DontCloseSpider, IgnoreRequest  # IgnoreRequest is needed in process_request
from twisted.internet import reactor


class DelayedRequestsMiddleware(object):
    requests = WeakKeyDictionary()

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    @classmethod
    def spider_idle(cls, spider):
        if cls.requests.get(spider):
            spider.log("delayed requests pending, not closing spider")
            raise DontCloseSpider()

    def process_request(self, request, spider):
        delay = request.meta.pop('delay_request', None)
        if delay:
            self.requests.setdefault(spider, 0)
            self.requests[spider] += 1
            reactor.callLater(delay, self.schedule_request, request.copy(), spider)
            raise IgnoreRequest()

    def schedule_request(self, request, spider):
        spider.crawler.engine.schedule(request, spider)
        self.requests[spider] -= 1
And it can be used in parse like:
yield Request(..., meta={'delay_request': 5})
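For the delay to take effect, the class has to be enabled as a downloader middleware, since it implements process_request. A minimal sketch of the settings entry, assuming the class lives in a hypothetical myproject/middlewares.py module:

# settings.py -- the module path is an assumption; point it at wherever
# DelayedRequestsMiddleware is actually defined.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DelayedRequestsMiddleware': 543,
}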
Here is a simple Python file, test.py:
import math


class myClass():
    def myFun(self, x):
        return math.sqrt(x)


if __name__ == "__main__":
    myInstance = myClass()
    print(myInstance.myFun(9))
It prints 3.0 when run with python test.py; let's analyse the running process.
1. myClass is instantiated and the instance is assigned to myInstance.
2. The myFun method is called and the result is printed.
Now it is Scrapy's turn.
In the Scrapy 1.4 manual, quotes_spider.py is as below:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Running the spider with scrapy crawl quotes, I am puzzled:
1. Where is the main function or main body of the spider?
2. Which class is instantiated?
3. Which method is called? Is it something like this?
mySpider = QuotesSpider(scrapy.Spider)
mySpider.parse(response)
How does scrapy crawl work exactly?
So let's start. Assuming you use Linux/Mac, let's check where scrapy is:
$ which scrapy
/Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
Let's look at the content of this file
$ cat /Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
#!/Users/tarun.lalwani/.virtualenvs/myproject/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys
from scrapy.cmdline import execute
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    sys.exit(execute())
So this executes the execute method from cmdline.py, and here is your main method.
cmdline.py
from __future__ import print_function
....
....

def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError: pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


if __name__ == '__main__':
    execute()
Now, if you look at the execute method, it processes the arguments you passed, which are crawl quotes in your case. The execute method scans the project for spider classes and checks which one has name defined as quotes. It then creates a CrawlerProcess instance, and that runs the whole show.
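You can see roughly the same machinery by driving CrawlerProcess yourself from a script. This is only an illustrative sketch, not what the scrapy command literally runs, but the classes and calls are the same ones cmdline.py ends up using:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Build a CrawlerProcess from the project settings, register the spider
# class, and start the Twisted reactor.
process = CrawlerProcess(get_project_settings())
process.crawl(QuotesSpider)  # the spider class from the question
process.start()              # blocks until crawling is finished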
Scrapy is based on the Twisted Python framework, which is a scheduler-based framework.
Consider the below part of the code
for url in urls:
    yield scrapy.Request(url=url, callback=self.parse)
When the engine executes this function, the first yield is executed and the value is returned to the engine. The engine then looks at other pending tasks and executes them (when they yield, some other pending task in the queue gets a chance). So yield is what allows a function's execution to be broken into parts, and that is what lets Scrapy/Twisted work.
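The mechanism is plain Python generator behaviour. Here is a small, Scrapy-free illustration showing that calling such a function only creates a generator, and the caller (the engine, in Scrapy's case) decides when each chunk of the body actually runs:

def start_requests():
    for url in ['page/1/', 'page/2/']:
        yield url  # execution pauses here until the caller asks for the next value

gen = start_requests()  # nothing in the body has run yet
print(next(gen))        # runs up to the first yield -> 'page/1/'
print(next(gen))        # resumes and runs to the second yield -> 'page/2/'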
You can get a detailed overview on the link below
https://doc.scrapy.org/en/latest/topics/architecture.html
I am using Scrapy 1.1 and I call Scrapy from within a script. My spider launching method looks like this:
def run_spider(self):
    runner = CrawlerProcess(get_project_settings())
    spider = SiteSpider()
    configure_logging()
    d = runner.crawl(spider, websites_file=self.raw_data_file)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Here is an extract of my spider, with an errback written as in the documentation, but it only prints when it catches a failure.
class SiteSpider(scrapy.Spider):
    name = 'SiteCrawler'
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
    }

    def __init__(self, websites_file=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.websites_file = websites_file
        print('***********')
        print(self.websites_file)

    def start_requests(self):
        .....
        if is_valid_url(website_url):
            yield scrapy.Request(url=website_url, callback=self.parse, errback=self.handle_errors, meta={'url': account_id})

    def parse(self, response):
        .....
        yield item

    def handle_errors(self, failure):
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            print('HttpError on ' + response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            print('DNSLookupError on ' + request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            print('TimeoutError on ' + request.url)
My problem is that I get errors I expect, like:
TimeoutError on http://www.example.com
But I also get tracebacks for the same websites:
2016-08-05 13:40:55 [scrapy] ERROR: Error downloading <GET http://www.example.com/robots.txt>: TCP connection timed out: 60: Operation timed out.
Traceback (most recent call last):
  File ".../anaconda/lib/python3.5/site-packages/twisted/internet/defer.py", line 1126, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File ".../anaconda/lib/python3.5/site-packages/twisted/python/failure.py", line 389, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File ".../anaconda/lib/python3.5/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.TCPTimedOutError: TCP connection timed out: 60: Operation timed out.
The written exception-handling messages and the tracebacks can often be traced to the same websites. After searching a lot on Stack Overflow, in the docs and the like, I still don't know why I see the tracebacks.
This also occurs with DNSLookupErrors, for example.
Excuse me, my Scrapy knowledge is juvenile. Is this normal behavior?
Also, I added this to settings.py, which is under my crawler. Other entries (for example the item pipelines) work as expected.
LOG_LEVEL = 'WARNING'
But I still see debug messages, not only warnings and everything above that (if configure_logging() is added to the spider launch). I am running this from the terminal on macOS.
I would be very happy to get any help with this.
Try this in a script:
if __name__ == '__main__':
    runner = CrawlerProcess(get_project_settings())
    spider = SiteSpider()
    configure_logging()
    d = runner.crawl(spider, websites_file=self.raw_data_file)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
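As for the LOG_LEVEL part of the question, one likely explanation (I cannot reproduce your setup, so treat this as a hedged guess) is that configure_logging() called with no arguments installs a root handler at Scrapy's default DEBUG level, while CrawlerProcess already configures logging itself from the settings you hand it. A sketch that should respect the WARNING level; the websites.txt file name is just a placeholder:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('LOG_LEVEL', 'WARNING')  # or rely on the value in settings.py

# CrawlerProcess configures logging from these settings on its own,
# so no separate configure_logging() call is needed here.
process = CrawlerProcess(settings)
process.crawl(SiteSpider, websites_file='websites.txt')  # placeholder file name
process.start()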
I'm running a Twisted TCP server. Each protocol instance has an mqtt pub/sub client. I've reduced the actual production code to the simplest possible form below. I've stripped out a lot of irrelevant complexity to simplify the bug-finding process. The server works for multiple client connections and produces the data received from the mqtt client, but after any client connects/disconnects/reconnects a few times I get an exception that I haven't been able to understand or trap. I'm hoping that Jean-Paul or someone can point me at the error of my ways.
Once the exception occurs, no new clients can connect to the server. Each new connect attempt produces the exception.
Clients that are already connected continue to receive data ok.
The exception is
Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/twisted/python/log.py", line 73, in callWithContext
    return context.call({ILogContext: newCtx}, func, *args, **kw)
  File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 118, in callWithContext
    return self.currentContext().callWithContext(ctx, func, *args, **kw)
  File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 81, in callWithContext
    return func(*args,**kw)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/posixbase.py", line 614, in _doReadOrWrite
    why = selectable.doRead()
--- <exception caught here> ---
  File "/usr/lib/python2.7/dist-packages/twisted/internet/tcp.py", line 1069, in doRead
    transport = self.transport(skt, protocol, addr, self, s, self.reactor)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/tcp.py", line 786, in __init__
    self.startReading()
  File "/usr/lib/python2.7/dist-packages/twisted/internet/abstract.py", line 429, in startReading
    self.reactor.addReader(self)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/epollreactor.py", line 256, in addReader
    _epoll.EPOLLIN, _epoll.EPOLLOUT)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/epollreactor.py", line 240, in _add
    self._poller.modify(fd, flags)
exceptions.IOError: [Errno 2] No such file or directory
The basic server code is:
(this example will run and does generate the exception)
from twisted.internet import reactor
from twisted.internet.protocol import Factory
from twisted.protocols import basic
from paho.mqtt import client  # The most recent version of the legacy Mosquitto client
from random import randint


class MsgReceiver(basic.LineReceiver):

    def __init__(self, factory):  # new (factory)
        self.factory = factory    # new (factory)

    def connectionMade(self):
        self.mqClient = self.startMQ()
        if self.mqClient:
            self.factory.clients.append(self)
        else:
            self.transport.loseConnection()

    def connectionLost(self, reason):
        pass

    def lineReceived(self, line):
        pass

    def on_message(self, mosq, obj, msg):
        try:
            self.sendLine(msg.payload)
        except Exception, err:
            print(err.message)

    def startMQ(self):
        mqName = "-".join(["myAppName", str(randint(0, 99999))])
        mqClient = client.Client(mqName)
        if mqClient.connect("localhost", 1883, 60) != 0:
            print('Could not connect to mq server')
            return False
        mqClient.on_message = self.on_message
        (success, mid) = mqClient.subscribe("myTopic", 0)
        if success != 0:
            return False
        mqClient.loop_start()
        return mqClient


class MsgReceiverFactory(Factory):
    allow_reuse_address = True

    def __init__(self, clients):
        self.clients = clients

    def buildProtocol(self, addr):
        return MsgReceiver(self)


if __name__ == "__main__":
    try:
        clients = []
        reactor.listenTCP(43217, MsgReceiverFactory(clients))
        reactor.run()
    except Exception, err:
        print(err.message)
        if reactor.running:
            reactor.stop()
A simple client that will induce the error when run twice (the first time it runs fine):
Interestingly, if I enable the time.sleep(3) it runs fine and doesn't seem to induce the error.
#!/usr/bin/python
from __future__ import print_function
import socket
import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(("localhost", 43217))
data = s.recv(1024)
print(data)
#time.sleep(3)
s.close()