I am trying to get the job id of a Scrapy 2.1.x job in the pipeline's close_spider method:
import os

class mysql_pipeline(object):
    def close_spider(self, spider):
        print(os.environ['SCRAPY_JOB'])
Unfortunately this results in a KeyError:
ERROR: Scraper close failure
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/andy/spider2/crawler/pipelines.py", line 137, in close_spider
os.environ['SCRAPY_JOB'],
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/os.py", line 675, in __getitem__
raise KeyError(key) from None
KeyError: 'SCRAPY_JOB'
2020-05-16 17:24:52 [scrapy
How can I pull the job id within the method?
In the spider constructor (inside __init__), add the line:
self.jobId = kwargs.get('_job')
Then in the parse method, pass it along in the item:
def parse(self, response):
    data = {}
    # ... populate data ...
    data['jobId'] = self.jobId
    yield data
In the pipeline, add this:
def process_item(self, item, spider):
    self.jobId = item['jobId']
    ...
def close_spider(self, spider):
    print(self.jobId)
    ...
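For completeness, a minimal sketch tying the two pieces together (reusing the question's pipeline class name). One assumption, labeled loudly: the _job argument and the SCRAPY_JOB environment variable are only set when the job runs under scrapyd, which is why a plain local run raises the KeyError; os.environ.get() avoids that:
import os

class mysql_pipeline(object):
    job_id = None  # filled in by process_item

    def process_item(self, item, spider):
        # jobId is whatever the spider copied from its _job argument
        self.job_id = item.get('jobId')
        return item

    def close_spider(self, spider):
        # fall back to the environment variable scrapyd sets;
        # .get() returns None instead of raising KeyError
        print(self.job_id or os.environ.get('SCRAPY_JOB'))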
I have a dict_ class that attempts to copy the built-in dict class.
Here is that class (the new function returns the original object):
class dict_:
    def __init__(self, *args, **kwargs):
        self.kv = kwargs
        if not self.kv:
            for kv in args:
                for k, v in kv:
                    self.kv.update({k: v})

    def __str__(self):
        return "%s" % self.kv

    def __getitem__(self, item):
        return self.kv[item]

    def update(self, *args):
        self.kv.update(args)
I've called it like this:
from Dodger.dodger import *
term = new(System())
a = new(dict_(a=1, b=2))
a.update(new(dict_(c=3)))
term.println(a)
This is supposed to modify a to {"a": 1, "b": 2, "c": 3} but instead it gives me this error:
Traceback (most recent call last):
File "C:/free_time/Dodger/dodger_test.py", line 5, in <module>
File "C:\free_time\Dodger\dodger.py", line 176, in update
File "C:\free_time\Dodger\dodger.py", line 173, in __getitem__
KeyError: 0
Why is it giving a KeyError? What does the 0 mean? (I am using Python 3.8.2)
I figured out how to solve this problem. I just have to implement __setitem__ too:
def __setitem__(self, key, value):
    """Set self[key] to value."""
    self.kv[key] = value
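As for where the 0 comes from: when dict.update() is handed an object with no keys() method, it treats it as a sequence of (key, value) pairs and iterates it with the legacy protocol, calling __getitem__(0), __getitem__(1), and so on. dict_ defines __getitem__ but neither keys() nor __iter__, so the very first probe asks it for key 0. A minimal sketch of the mechanism (the class name NoKeys is illustrative):
class NoKeys:
    # like dict_: lookup works, but there is no keys() or __iter__
    def __getitem__(self, item):
        return {"a": 1}[item]

d = {}
d.update(NoKeys())  # legacy iteration calls __getitem__(0) -> KeyError: 0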
I have the following (simplified) code:
import os
import scrapy
class TestSpider(scrapy.Spider):
    name = 'test_spider'
    start_urls = ['http://www.pdf995.com/samples/pdf.pdf', ]

    def parse(self, response):
        save_path = 'test'
        file_name = 'test.pdf'
        self.save_page(response, save_path, file_name)

    def save_page(self, response, save_dir, file_name):
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, file_name), 'wb') as afile:
            afile.write(response.body)
When I run it, I get this error:
[scrapy.core.scraper] ERROR: Error downloading <GET http://www.pdf995.com/samples/pdf.pdf>
Traceback (most recent call last):
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "C:\Python36\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1278, in returnValue
raise _DefGen_Return(val)
twisted.internet.defer._DefGen_Return: <200 http://www.pdf995.com/samples/pdf.pdf>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "C:\Python36\lib\site-packages\scrapy\core\downloader\middleware.py", line 53, in process_response
spider=spider)
File "C:\Python36\lib\site-packages\scrapy_beautifulsoup\middleware.py", line 16, in process_response
return response.replace(body=str(BeautifulSoup(response.body, self.parser)))
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 79, in replace
return cls(*args, **kwargs)
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 20, in __init__
self._set_body(body)
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 55, in _set_body
"Response body must be bytes. "
TypeError: Response body must be bytes. If you want to pass unicode body use TextResponse or HtmlResponse.
Do I need to introduce a middleware or something to handle this? This looks like it should be valid, at least by other examples.
Note: at the moment I'm not using a pipeline because my real spider has a lot of checks: whether the related item has been scraped, whether this PDF belongs to the item, and whether a PDF with a custom name was already downloaded. And as mentioned, many samples do what I'm doing here, so I thought it would be easier and would work.
The issue is caused by your own scrapy_beautifulsoup\middleware.py, which replaces the response body with a str: return response.replace(body=str(BeautifulSoup(response.body, self.parser))). A plain (binary) Response like this PDF requires a bytes body, hence the TypeError. Correct the middleware and that should fix the issue.
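A minimal sketch of such a correction, assuming the middleware looks roughly like the one in the traceback (the class name and parser attribute here are illustrative): pass anything that is not an HtmlResponse through untouched, since a str body is only valid for text responses:
from bs4 import BeautifulSoup
from scrapy.http import HtmlResponse

class BeautifulSoupMiddleware:
    def __init__(self, parser='html.parser'):
        self.parser = parser

    def process_response(self, request, response, spider):
        # binary bodies (e.g. PDFs) must stay bytes, so skip them
        if not isinstance(response, HtmlResponse):
            return response
        # HtmlResponse is a TextResponse, so a str body is accepted here
        return response.replace(body=str(BeautifulSoup(response.body, self.parser)))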
I'm trying to have scrapy download a copy of each page it crawls but when I run my spider the log contains entries like
2016-06-20 15:39:12 [scrapy] ERROR: Error processing {'file_urls': 'http://example.com/page',
'title': u'PageTitle'}
Traceback (most recent call last):
File "c:\anaconda3\envs\scrapy\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\pipelines\media.py", line 44, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\pipelines\files.py", line 365, in get_media_requests
return [Request(x) for x in item.get(self.files_urls_field, [])]
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\http\request\__init__.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
Other questions on SO about this error seem to relate to problems with start_urls, but my start URL is fine: the spider crawls across the site, it just doesn't save the page to my specified FILES_STORE.
I populate the file_urls using item['file_urls'] = response.url
Do I need to specify the url a different way?
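The traceback points at the likely cause: get_media_requests builds [Request(x) for x in item.get(self.files_urls_field, [])], so a bare string URL gets iterated character by character and the first Request is built from just 'h'. A minimal sketch of the fix, wrapping the URL in a list (the surrounding item fields are illustrative):
def parse(self, response):
    item = {'title': 'PageTitle'}  # illustrative
    # FilesPipeline expects file_urls to be a list of URLs; a bare
    # string is iterated per character, hence "Missing scheme ... h"
    item['file_urls'] = [response.url]
    yield item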
I followed these steps to create a module named "x_test":
1. Settings --> Technical --> Database Structure --> Models --> Create
   Model Description: Test
   Model: x_test
2. Add Fields
   Name: x_sample
   Field Label: sample
   Field Type: boolean
3. Save
4. Click on "Create a Menu"
5. Select the appropriate menu and click on "CREATE MENU"
6. Click on the menu provided and try to enter values for my custom module
7. When I "SAVE" my record, I get the following error:
Traceback (most recent call last):
File "/var/app/openerp/server/openerp/netsvc.py", line 292, in dispatch_rpc
result = ExportService.getService(service_name).dispatch(method, params)
File "/var/app/openerp/server/openerp/service/web_services.py", line 626, in dispatch
res = fn(db, uid, *params)
File "/var/app/openerp/server/openerp/osv/osv.py", line 188, in execute_kw
return self.execute(db, uid, obj, method, *args, **kw or {})
File "/var/app/openerp/server/openerp/osv/osv.py", line 131, in wrapper
return f(self, dbname, *args, **kwargs)
File "/var/app/openerp/server/openerp/osv/osv.py", line 197, in execute
res = self.execute_cr(cr, uid, obj, method, *args, **kw)
File "/var/app/openerp/server/openerp/osv/osv.py", line 185, in execute_cr
return getattr(object, method)(cr, uid, *args, **kw)
File "/var/app/openerp/server/openerp/osv/orm.py", line 4434, in create
cr.execute('insert into "'+self._table+'" (id'+upd0+") values ("+str(id_new)+upd1+')', tuple(upd2))
File "/var/app/openerp/server/openerp/sql_db.py", line 161, in wrapper
return f(self, *args, **kwargs)
File "/var/app/openerp/server/openerp/sql_db.py", line 228, in execute
res = self._obj.execute(query, params)
ProgrammingError: column "x_sample" of relation "x_test" does not exist
LINE 1: insert into "x_test" (id,"x_sample",create_uid,create_date,wri...
Is there any mistake in my way of creating a module via the Web Interface?
I had the same issue: go into the database and, within the table x_test, create the field you added in the model. As far as I know it should create the fields for you, but I have had to create them manually.
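For illustration, a hedged sketch of that manual step, assuming OpenERP's PostgreSQL backend and the psycopg2 driver (the connection details are placeholders):
import psycopg2

# placeholder connection details -- point these at the OpenERP database
conn = psycopg2.connect(dbname='openerp_db', user='openerp', password='secret')
with conn.cursor() as cur:
    # add the column the ORM expected for the x_sample boolean field
    cur.execute('ALTER TABLE x_test ADD COLUMN x_sample boolean')
conn.commit()
conn.close()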
I'm a new user of Scrapy, crawling my websites. I want to store the crawled data in a MySQL database.
myspider.py:
class MininovaSpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']
    rules = [Rule(SgmlLinkExtractor(allow=('/categorie/.*'), restrict_xpaths=('//div[@id="contLeftNavig"]',)), 'parse_t')]

    def parse_t(self, response):
        x = HtmlXPathSelector(response)
        torrent = Torrent()
        torrent['url'] = response.url
        torrent['title'] = x.select("//h1[@class='infoAneTitre']/text()").extract()
        torrent['wilaya'] = x.select("//span[@class='ville_t']/text()").extract()
        #torrent['prix'] = x.select("//div[@id='datail_ann']/ul[1]/li[4]/span/text()").extract()
        #torrent['surface'] = x.select("//div[@id='datail_ann']/ul[3]/li[1]/span/text()").extract()
        torrent['description'] = x.select("//div[@class='box_pad']/text()").extract()
        return torrent
For pipelines.py, I modified and used the googledir example. When I run the crawl I get this error:
exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
pipelines.py:
from scrapy import log
from twisted.enterprise import adbapi
import time
import MySQLdb.cursors
class Pipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='test',
            user='root',
            passwd='',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        tx.execute("select * from database where url = %s", (item['url'],))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into database (wilaya, titre, site, lien, resume, timestamp) "
                "values (%s, %s, %s, %s, %s, %s)",
                (item['wilaya'],
                 item['title'],
                 'example.com', item['url'], item['description'],
                 time.time())
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
And the traceback:
Traceback (most recent call last):
File "/usr/lib/python2.7/twisted/internet/defer.py", line 287, in addCallbacks
self._runCallbacks()
File "/usr/lib/python2.7/twisted/internet/defer.py", line 545, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/core/scraper.py", line 208, in _itemproc_finished
item=output, response=response, spider=spider)
File "/usr/lib/python2.7/site-packages/scrapy/utils/signal.py", line 53, in send_catch_log_deferred
*arguments, **named)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/internet/defer.py", line 134, in maybeDeferred
result = f(*args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/xlib/pydispatch/robustapply.py", line 47, in robustApply
return receiver(*arguments, **named)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/feedexport.py", line 177, in item_scraped
slot.exporter.export_item(item)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 109, in export_item
itemdict = dict(self._get_serialized_fields(item))
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 60, in _get_serialized_fields
field_iter = item.iterkeys()
exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
2012-01-18 16:00:43-0600 [scrapy] Unhandled Error
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 503, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 530, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 483, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/python/threadpool.py", line 207, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/lib/python2.7/twisted/enterprise/adbapi.py", line 448, in _runInteraction
result = interaction(trans, *args, **kw)
File "/opt/scrapy/test/pipelines.py", line 33, in _conditional_insert
tx.execute("select * from database where url = %s", (item['url'] ))
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
It looks like you have yielded a spider (MininovaSpider) instance somewhere instead of an item. I think there is more code you haven't shown. To confirm, put this in Pipeline.process_item():
def process_item(self, spider, item):
    assert isinstance(item, Torrent), 'Here should be Torrent instance!'
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)
    return item
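One more thing worth double-checking, as a hedged guess from the code shown: Scrapy calls pipelines as process_item(item, spider), so the (self, spider, item) parameter order above binds the scraped item to the spider parameter and the spider to the item parameter. That alone would explain both messages: the returned "item" handed on to the feed exporter is really the spider (no iterkeys), and _conditional_insert receives the spider too (not subscriptable). A corrected sketch:
def process_item(self, item, spider):  # note the (item, spider) order
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)
    return item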