Wrote an errback for my Scrapy spider, but tracebacks also keep happening. Why?

I am using Scrapy 1.1 and I call Scrapy from within a script. My spider launching method looks like this:
def run_spider(self):
    runner = CrawlerProcess(get_project_settings())
    spider = SiteSpider()
    configure_logging()
    d = runner.crawl(spider, websites_file=self.raw_data_file)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Here is an extract of my spider, with an errback written as in the documentation, but it only prints when it catches a failure.
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError

class SiteSpider(scrapy.Spider):
    name = 'SiteCrawler'
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
    }

    def __init__(self, websites_file=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.websites_file = websites_file
        print('***********')
        print(self.websites_file)

    def start_requests(self):
        .....
        if is_valid_url(website_url):
            yield scrapy.Request(url=website_url, callback=self.parse, errback=self.handle_errors, meta={'url': account_id})

    def parse(self, response):
        .....
        yield item

    def handle_errors(self, failure):
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            print('HttpError on ' + response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            print('DNSLookupError on ' + request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            print('TimeoutError on ' + request.url)
My problem is that I get the errors I expect, like:
TimeoutError on http://www.example.com
But I also get tracebacks for the same websites:
2016-08-05 13:40:55 [scrapy] ERROR: Error downloading <GET http://www.example.com/robots.txt>: TCP connection timed out: 60: Operation timed out.
Traceback (most recent call last):
  File ".../anaconda/lib/python3.5/site-packages/twisted/internet/defer.py", line 1126, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File ".../anaconda/lib/python3.5/site-packages/twisted/python/failure.py", line 389, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File ".../anaconda/lib/python3.5/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.TCPTimedOutError: TCP connection timed out: 60: Operation timed out.
The exception-handling messages I print and the tracebacks can often be traced back to the same websites. After searching a lot on Stack Overflow, in the docs and the like, I still don't know why I see the tracebacks.
This also occurs with DNSLookupError, for example.
Excuse me, my Scrapy knowledge is still quite basic. Is this normal behavior?
Also, I added this to settings.py, which is under my crawler project. Other entries (for example the item_pipelines) definitely work:
LOG_LEVEL = 'WARNING'
But I still see debug messages, not only warnings and everything above that (at least when configure_logging() is added to the spider launch). I am running this from the terminal on macOS.
I would be very happy to get any help with this.

Try this in a script:
if __name__ == '__main__':
    runner = CrawlerProcess(get_project_settings())
    spider = SiteSpider()
    configure_logging()
    d = runner.crawl(spider, websites_file=self.raw_data_file)  # replace self.raw_data_file with the actual path; there is no `self` at module level
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
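A side note on the LOG_LEVEL issue mentioned in the question (a hedged sketch, not a confirmed part of this answer): when configure_logging() is called with no arguments it installs a root handler using Scrapy's default settings, so LOG_LEVEL = 'WARNING' from settings.py never takes effect. Passing the project settings explicitly should apply it, or the extra configure_logging() call can simply be dropped, since CrawlerProcess already configures logging from the settings it is given:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
configure_logging(settings)        # applies LOG_LEVEL from settings.py to the root handler
runner = CrawlerProcess(settings)  # would also configure logging from these settings on its own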

Related

Scrapyd API: I get an exception when I try to start a spider

I have an issue with the scrapyd API.
I wrote a simple spider; it gets the domain URL as an argument.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'

    def __init__(self, domains=None):
        self.allowed_domains = [domains]
        self.start_urls = ['http://{}/'.format(domains)]

    def parse(self, response):
        # time.sleep(int(self.sleep))
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        yield item
It works perfectly if I run it like:
scrapy crawl quotes -a domains=quotes.toscrape.com
But when it comes time to run it via scrapyd_api, it goes wrong:
from scrapyd_api import ScrapydAPI
scrapyd = ScrapydAPI('http://localhost:6800')
scrapyd.schedule(project='pd', spider='quotes', domains='http://quotes.toscrape.com/')
I get: builtins.TypeError: __init__() got an unexpected keyword argument '_job'
How can I start scrapy spiders via scrapyd api with args?
Here is an answer. According to this answer, I was wrong about the super() method. Now my code looks like this:
class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = []

    def __init__(self, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = [kwargs.get('domains')]
        self.start_urls.append('http://{}/'.format(kwargs.get('domains')))
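To illustrate why this helps (a sketch with made-up values, not actual scrapyd internals): scrapyd passes an extra _job keyword argument when it schedules a spider, and **kwargs lets __init__ accept it and forward it to scrapy.Spider, which stores unknown keyword arguments as spider attributes.

# Illustrative only: the _job value below is invented, the class is the one defined above.
spider = QuotesSpider(domains='quotes.toscrape.com', _job='2019-01-01T00_00_00')
print(spider.start_urls)       # ['http://quotes.toscrape.com/']
print(spider.allowed_domains)  # ['quotes.toscrape.com']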

Scrapy - idle signal spider running into an error

I'm trying to create a spider that runs all the time; as soon as it reaches its idle state, it should fetch the next URL to parse from the database.
Unfortunately, I got stuck at the very beginning already:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from scrapy import Spider
import logging

class SignalspiderSpider(Spider):
    name = 'signalspider'
    allowed_domains = ['domain.de']
    yet = False

    def start_requests(self):
        logging.log(logging.INFO, "______ Loading requests")
        yield scrapy.Request('https://www.domain.de/product1.html')

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        logging.log(logging.INFO, "______ From Crawler")
        spider = super(SignalspiderSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.idle, signal=scrapy.signals.spider_idle)
        return spider

    def parse(self, response):
        self.logger.info("______ Finished extracting structured data from HTML")
        pass

    def idle(self):
        logging.log(logging.INFO, "_______ Idle state")
        if not self.yet:
            self.crawler.engine.crawl(self.create_request(), self)
            self.yet = True

    def create_request(self):
        logging.log(logging.INFO, "_____________ Create requests")
        yield scrapy.Request('https://www.domain.de/product2.html?dvar_82_color=blau&cgid=')
and the error that I get:
2019-03-27 21:41:38 [root] INFO: _______ Idle state
2019-03-27 21:41:38 [root] INFO: _____________ Create requests
2019-03-27 21:41:38 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method RefererMiddleware.request_scheduled of <scrapy.spidermiddlewares.referer.RefererMiddleware object at 0x7f93bcc13978>>
Traceback (most recent call last):
  File "/home/spidy/Documents/spo/lib/python3.5/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
    *arguments, **named)
  File "/home/spidy/Documents/spo/lib/python3.5/site-packages/pydispatch/robustapply.py", line 55, in robustApply
    return receiver(*arguments, **named)
  File "/home/spidy/Documents/spo/lib/python3.5/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
    redirected_urls = request.meta.get('redirect_urls', [])
AttributeError: 'NoneType' object has no attribute 'meta'
What am I doing wrong?
Try with:
def idle(self, spider):
    logging.log(logging.INFO, "_______ Idle state")
    if not self.yet:
        self.yet = True
        # Request needs to be imported, e.g. `from scrapy import Request`
        self.crawler.engine.crawl(Request(url='https://www.domain.de/product2.html?dvar_82_color=blau&cgid=', callback=spider.parse), spider)
I'm not sure it is correct to create a request in the spider_idle handler by passing another method which builds the request, the way you do.
See more at Scrapy spider_idle signal - need to add requests with parse item callback
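For the "running all the time" part of the question, a hedged sketch of the usual pattern (assuming Scrapy 1.x, where crawler.engine.crawl(request, spider) is available): schedule the next request inside the idle handler and raise DontCloseSpider so the spider is not closed while there is still work to fetch from the database. The URL below is just the one from the question; in practice it would come from your database query.

from scrapy import Request
from scrapy.exceptions import DontCloseSpider

def idle(self, spider):
    # e.g. the next URL fetched from the database
    next_url = 'https://www.domain.de/product2.html?dvar_82_color=blau&cgid='
    self.crawler.engine.crawl(Request(url=next_url, callback=spider.parse), spider)
    raise DontCloseSpider  # keep the spider alive until there is nothing left to schedule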

How does scrapy crawl work: which class is instantiated and which method is called?

Here is a simple Python file, test.py:
import math

class myClass():
    def myFun(self, x):
        return(math.sqrt(x))

if __name__ == "__main__":
    myInstance = myClass()
    print(myInstance.myFun(9))
It prints 3 with python test.py. Let's analyse the running process:
1. myClass is instantiated and assigned to myInstance.
2. myFun is called and the result is printed.
Now it is Scrapy's turn.
In the Scrapy 1.4 manual, quotes_spider.py is as below:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
When I run the spider with scrapy crawl quotes, I am puzzled:
1. Where is the main function or main body of the spider?
2. Which class is instantiated?
3. Which method is called? Something like:
mySpider = QuotesSpider(scrapy.Spider)
mySpider.parse(response)
How does scrapy crawl work exactly?
So let's start. Assuming you use Linux/Mac, let's check where scrapy lives:
$ which scrapy
/Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
Let's look at the contents of this file:
$ cat /Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
#!/Users/tarun.lalwani/.virtualenvs/myproject/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys
from scrapy.cmdline import execute

if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    sys.exit(execute())
So this executes the execute method from cmdline.py, and here is your main method.
cmdline.py
from __future__ import print_function
....
....

def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError: pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)

if __name__ == '__main__':
    execute()
Now, if you look at the execute method, it processes the arguments you passed, which are crawl quotes in your case. It scans the project for spider classes, checks which one has name defined as quotes, then creates the CrawlerProcess, and that runs the whole show.
Scrapy is built on the Twisted Python framework, which is a scheduler-based framework.
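Putting that together, here is a rough programmatic sketch of what scrapy crawl quotes boils down to (a simplification of the cmdline.py path above, assuming a regular project layout with SPIDER_MODULES configured):

# Roughly equivalent to running `scrapy crawl quotes` from inside the project directory.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('quotes')  # the spider class is looked up by its `name` attribute
process.start()          # starts the Twisted reactor and runs the crawl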
Consider the part of the code below:
for url in urls:
    yield scrapy.Request(url=url, callback=self.parse)
When the engine executes this function, the first yield is executed and the value is returned to the engine. The engine then looks at other pending tasks and executes them (when they yield, some other pending task in the queue gets its chance). So yield is what allows a function's execution to be broken into parts, and that is what lets Scrapy/Twisted interleave work.
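A minimal illustration of that idea in plain Python (not Scrapy internals; the URLs are placeholders):

def make_requests():
    for url in ['http://example.com/page/1/', 'http://example.com/page/2/']:
        print('about to yield', url)
        yield url  # execution pauses here until the caller asks for the next value

gen = make_requests()  # nothing runs yet; we only get a generator object
print(next(gen))       # runs the body up to the first yield
print(next(gen))       # resumes right after the first yield

This is what lets the engine pull requests from start_requests() one at a time and schedule other pending work in between.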
You can get a detailed overview on the link below
https://doc.scrapy.org/en/latest/topics/architecture.html

Twisted error in server with mqtt client

I'm running a Twisted TCP server. Each protocol instance has an mqtt pub/sub client. I've reduced the actual production code to the simplest possible form below. I've stripped out a lot of irrelevant complexity to simplify the bug-finding process. The server works for multiple client connections and produces the data received from the mqtt client, but after any client connects/disconnects/reconnects a few times I get an exception that I haven't been able to understand or trap. I'm hoping that Jean-Paul or someone can point me at the error of my ways.
Once the exception occurs, no new clients can connect to the server. Each new connect attempt produces the exception.
Clients that are already connected continue to receive data ok.
The exception is
Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/twisted/python/log.py", line 73, in callWithContext
    return context.call({ILogContext: newCtx}, func, *args, **kw)
  File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 118, in callWithContext
    return self.currentContext().callWithContext(ctx, func, *args, **kw)
  File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 81, in callWithContext
    return func(*args,**kw)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/posixbase.py", line 614, in _doReadOrWrite
    why = selectable.doRead()
--- <exception caught here> ---
  File "/usr/lib/python2.7/dist-packages/twisted/internet/tcp.py", line 1069, in doRead
    transport = self.transport(skt, protocol, addr, self, s, self.reactor)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/tcp.py", line 786, in __init__
    self.startReading()
  File "/usr/lib/python2.7/dist-packages/twisted/internet/abstract.py", line 429, in startReading
    self.reactor.addReader(self)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/epollreactor.py", line 256, in addReader
    _epoll.EPOLLIN, _epoll.EPOLLOUT)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/epollreactor.py", line 240, in _add
    self._poller.modify(fd, flags)
exceptions.IOError: [Errno 2] No such file or directory
The basic server code is:
(this example will run and does generate the exception)
from twisted.internet import reactor
from twisted.internet.protocol import Factory
from twisted.protocols import basic
from paho.mqtt import client  # The most recent version of the legacy Mosquitto client
from random import randint

class MsgReceiver(basic.LineReceiver):
    def __init__(self, factory):  # new (factory)
        self.factory = factory  # new (factory)

    def connectionMade(self):
        self.mqClient = self.startMQ()
        if self.mqClient:
            self.factory.clients.append(self)
        else:
            self.transport.loseConnection()

    def connectionLost(self, reason):
        pass

    def lineReceived(self, line):
        pass

    def on_message(self, mosq, obj, msg):
        try:
            self.sendLine(msg.payload)
        except Exception, err:
            print(err.message)

    def startMQ(self):
        mqName = "-".join(["myAppName", str(randint(0, 99999))])
        mqClient = client.Client(mqName)
        if mqClient.connect("localhost", 1883, 60) != 0:
            print('Could not connect to mq server')
            return False
        mqClient.on_message = self.on_message
        (success, mid) = mqClient.subscribe("myTopic", 0)
        if success != 0:
            return False
        mqClient.loop_start()
        return mqClient

class MsgReceiverFactory(Factory):
    allow_reuse_address = True

    def __init__(self, clients):
        self.clients = clients

    def buildProtocol(self, addr):
        return MsgReceiver(self)

if __name__ == "__main__":
    try:
        clients = []
        reactor.listenTCP(43217, MsgReceiverFactory(clients))
        reactor.run()
    except Exception, err:
        print(err.message)
        if reactor.running:
            reactor.stop()
A simple client that will induce the error when run twice (the first time it runs fine). Interestingly, if I enable the time.sleep(3) it runs fine and doesn't seem to induce the error:
#!/usr/bin/python
from __future__ import print_function
import socket
import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(("localhost", 43217))
data = s.recv(1024)
print(data)
#time.sleep(3)
s.close()

Getting URLError Exception when caching webdriver instances

I am attempting to cache webdriver instances across test case classes. I do not need a "clean" webdriver since I am simply using PhantomJS to query the DOM (I do need JavaScript enabled, which is why I am not simply fetching the source and parsing that).
The cache is a dictionary with the URL as a key and the driver instance as value. The cache is in the base test case, and I call get() which is a method on the base test case. This method instantiates webdriver, and goes to the url if the driver is not in the cache already.
It appears there's some kind of socket issue when trying to access driver properties on the cached instance in the second test case (derivedb.py). I'd appreciate it if someone could tell me how to get this to work.
I am getting the following output:
$ python launcher.py
test_a (deriveda.DerivedTestCaseA) ... Instantiate new driver
Title is: Google
ok
test_b (deriveda.DerivedTestCaseA) ... Retrieve driver from cache
Title is: Google
ok
test_a (derivedb.DerivedTestCaseB) ... Retrieve driver from cache
ERROR
======================================================================
ERROR: test_a (derivedb.DerivedTestCaseB)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/cohenaa/PycharmProjects/sanity/derivedb.py", line 7, in test_a
    print "Title is: %s" % self.driver.title
  File "/Users/cohenaa/sanity-env/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 185, in title
    resp = self.execute(Command.GET_TITLE)
  File "/Users/cohenaa/sanity-env/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 162, in execute
    response = self.command_executor.execute(driver_command, params)
  File "/Users/cohenaa/sanity-env/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 349, in execute
    return self._request(url, method=command_info[0], data=data)
  File "/Users/cohenaa/sanity-env/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 410, in _request
    resp = opener.open(request)
  File "/sw/lib/python2.7/urllib2.py", line 404, in open
    response = self._open(req, data)
  File "/sw/lib/python2.7/urllib2.py", line 422, in _open
    '_open', req)
  File "/sw/lib/python2.7/urllib2.py", line 382, in _call_chain
    result = func(*args)
  File "/sw/lib/python2.7/urllib2.py", line 1214, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/sw/lib/python2.7/urllib2.py", line 1184, in do_open
    raise URLError(err)
URLError: <urlopen error [Errno 61] Connection refused>
----------------------------------------------------------------------
Ran 3 tests in 1.271s
FAILED (errors=1)
launcher.py
import unittest
from deriveda import DerivedTestCaseA
from derivedb import DerivedTestCaseB

suite = unittest.TestSuite()
testclasses = [DerivedTestCaseA, DerivedTestCaseB]
testloader = unittest.TestLoader()
classes_to_names = {}
for tc in testclasses:
    classes_to_names[tc] = testloader.getTestCaseNames(tc)
for tc in classes_to_names:
    for testname in classes_to_names[tc]:
        suite.addTest(tc(testname))
unittest.TextTestRunner(verbosity=10).run(suite)
deriveda.py
from basetestcase import BaseTestCase
from unittest import main

class DerivedTestCaseA(BaseTestCase):
    def test_a(self):
        self.get("http://www.google.com")
        print "Title is: %s" % self.driver.title

    def test_b(self):
        self.get("http://www.google.com")
        print "Title is: %s" % self.driver.title
derivedb.py
from basetestcase import BaseTestCase

class DerivedTestCaseB(BaseTestCase):
    def test_a(self):
        self.get("http://www.google.com")
        print "Title is: %s" % self.driver.title
basetestcase.py:
import unittest
from selenium import webdriver

class BaseTestCase(unittest.TestCase):
    cache = {}

    def get(self, url):
        if url not in self.cache:
            print "Instantiate new driver"
            self.driver = webdriver.PhantomJS()
            self.driver.get(url)
            self.cache[url] = self.driver
        else:
            print "Retrieve driver from cache"
            self.driver = self.cache[url]

    @classmethod
    def tearDownClass(cls):
        for url in BaseTestCase.cache:
            BaseTestCase.cache[url].quit()
Ahhh. I see now. The driver is quitting after each test case. If I quit after the suite is run instead, no error.
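A minimal sketch of that fix, under the assumption that BaseTestCase.tearDownClass is removed and the cached drivers are shut down at the end of launcher.py, after the whole suite has run:

# launcher.py (end of file, sketch): quit cached drivers only once, after the suite,
# so later test classes can still reuse them.
from basetestcase import BaseTestCase  # assumed additional import at the top of launcher.py

unittest.TextTestRunner(verbosity=10).run(suite)
for url, driver in BaseTestCase.cache.items():
    driver.quit()
BaseTestCase.cache.clear()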