How to retrieve the scrapy job id within a method?

I am trying to get the job id of a Scrapy 2.1.x job in the close_spider method of a pipeline:

import os

class mysql_pipeline(object):
    def close_spider(self, spider):
        print(os.environ['SCRAPY_JOB'])

Unfortunately, this results in a KeyError:
ERROR: Scraper close failure
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/andy/spider2/crawler/pipelines.py", line 137, in close_spider
os.environ['SCRAPY_JOB'],
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/os.py", line 675, in __getitem__
raise KeyError(key) from None
KeyError: 'SCRAPY_JOB'
2020-05-16 17:24:52 [scrapy
How can I pull the job id within the method?

In the spider constructor (inside __init__), add the line:

self.jobId = kwargs.get('_job')

Then, in the parse function, pass it along in the item:

def parse(self, response):
    data = {}
    # ...
    data['jobId'] = self.jobId
    yield data

In the pipeline, add this:

def process_item(self, item, spider):
    self.jobId = item['jobId']
    # ...

def close_spider(self, spider):
    print(self.jobId)
    # ...
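Putting the pieces together, here is a minimal sketch of the whole flow (the spider and pipeline names are illustrative; the _job argument is only supplied when the job is scheduled through scrapyd, otherwise kwargs.get('_job') returns None):

import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # scrapyd passes the job id as the _job spider argument.
        self.jobId = kwargs.get('_job')

    def parse(self, response):
        yield {'jobId': self.jobId, 'url': response.url}


class MysqlPipeline(object):
    def process_item(self, item, spider):
        # Keep the job id carried by the item so close_spider can use it.
        self.jobId = item['jobId']
        return item

    def close_spider(self, spider):
        print(self.jobId)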

Related

Cannot update dictionary

I have a dict_ class that attempts to copy the built-in dict class.
Here is that class (the new function returns the original object):
class dict_:
    def __init__(self, *args, **kwargs):
        self.kv = kwargs
        if not self.kv:
            for kv in args:
                for k, v in kv:
                    self.kv.update({k: v})

    def __str__(self):
        return "%s" % self.kv

    def __getitem__(self, item):
        return self.kv[item]

    def update(self, *args):
        self.kv.update(args)
I've called it like this:
from Dodger.dodger import *
term = new(System())
a = new(dict_(a=1, b=2))
a.update(new(dict_(c=3)))
term.println(a)
This is supposed to modify a to {"a": 1, "b": 2, "c": 3} but instead it gives me this error:
Traceback (most recent call last):
File "C:/free_time/Dodger/dodger_test.py", line 5, in <module>
File "C:\free_time\Dodger\dodger.py", line 176, in update
File "C:\free_time\Dodger\dodger.py", line 173, in __getitem__
KeyError: 0
Why is it giving a KeyError? What does the 0 mean? (I am using python 3.8.2)
I figured out how to solve this problem. I just have to implement __setitem__ too:
def __setitem__(self, key, value):
    """Set self[key] to value."""
    self.kv[key] = value
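For illustration, once __setitem__ exists, item assignment works as expected; a standalone sketch with a stripped-down dict_, independent of the dodger module:

class dict_:
    def __init__(self, **kwargs):
        self.kv = dict(kwargs)

    def __getitem__(self, item):
        return self.kv[item]

    def __setitem__(self, key, value):
        """Set self[key] to value."""
        self.kv[key] = value


d = dict_(a=1, b=2)
d['c'] = 3           # works now that __setitem__ is defined
print(d.kv)          # {'a': 1, 'b': 2, 'c': 3}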

Error downloading PDF files

I have the following (simplified) code:
import os
import scrapy

class TestSpider(scrapy.Spider):
    name = 'test_spider'
    start_urls = ['http://www.pdf995.com/samples/pdf.pdf', ]

    def parse(self, response):
        save_path = 'test'
        file_name = 'test.pdf'
        self.save_page(response, save_path, file_name)

    def save_page(self, response, save_dir, file_name):
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, file_name), 'wb') as afile:
            afile.write(response.body)
When I run it, I get this error:
[scrapy.core.scraper] ERROR: Error downloading <GET http://www.pdf995.com/samples/pdf.pdf>
Traceback (most recent call last):
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "C:\Python36\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1278, in returnValue
raise _DefGen_Return(val)
twisted.internet.defer._DefGen_Return: <200 http://www.pdf995.com/samples/pdf.pdf>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "C:\Python36\lib\site-packages\scrapy\core\downloader\middleware.py", line 53, in process_response
spider=spider)
File "C:\Python36\lib\site-packages\scrapy_beautifulsoup\middleware.py", line 16, in process_response
return response.replace(body=str(BeautifulSoup(response.body, self.parser)))
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 79, in replace
return cls(*args, **kwargs)
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 20, in __init__
self._set_body(body)
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 55, in _set_body
"Response body must be bytes. "
TypeError: Response body must be bytes. If you want to pass unicode body use TextResponse or HtmlResponse.
Do I need to introduce a middleware or something to handle this? It looks like it should be valid, at least judging by other examples.
Note: at the moment I'm not using a pipeline because in my real spider I have a lot of checks on whether the related item has been scraped, on whether this PDF belongs to the item, and on a custom PDF name to see if it was already downloaded. And as mentioned, many examples do what I'm doing here, so I thought it would be simpler and would just work.
The issue is caused by your own scrapy_beautifulsoup\middleware.py, which replaces the response body with return response.replace(body=str(BeautifulSoup(response.body, self.parser))). That call turns the body into a str, but Response.replace() expects bytes, which is exactly what the TypeError says.
Correct that and the error should go away.
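A possible correction, sketched under the assumption that the middleware looks roughly like the line shown in the traceback (the class name and parser attribute are guesses, not the actual scrapy_beautifulsoup code):

from bs4 import BeautifulSoup

class BeautifulSoupMiddleware:
    """Sketch of a fixed process_response; not the actual scrapy_beautifulsoup code."""

    def __init__(self, parser='html.parser'):
        self.parser = parser

    def process_response(self, request, response, spider):
        # Leave binary responses (such as the PDF above) untouched.
        content_type = response.headers.get(b'Content-Type', b'')
        if b'html' not in content_type.lower():
            return response
        # Response.replace() expects bytes, so encode the re-serialized markup.
        body = str(BeautifulSoup(response.body, self.parser)).encode('utf-8')
        return response.replace(body=body)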

Error saving crawled page using file_urls and ITEM_PIPELINES: Missing scheme in request url: h

I'm trying to have scrapy download a copy of each page it crawls but when I run my spider the log contains entries like
2016-06-20 15:39:12 [scrapy] ERROR: Error processing {'file_urls': 'http://example.com/page',
'title': u'PageTitle'}
Traceback (most recent call last):
File "c:\anaconda3\envs\scrapy\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\pipelines\media.py", line 44, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\pipelines\files.py", line 365, in get_media_requests
return [Request(x) for x in item.get(self.files_urls_field, [])]
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\http\request\__init__.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
Other questions on SO about this error seem to relate to problems with start_urls, but my start URL is fine: the spider crawls across the site, it just doesn't save the page to my specified FILES_STORE.
I populate the file_urls using item['file_urls'] = response.url
Do I need to specify the url a different way?
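The trailing "h" in the error hints at the cause: FilesPipeline builds a Request for each element of file_urls (see the get_media_requests line in the traceback), so a plain string gets iterated character by character and the first "URL" is just "h". A minimal sketch of the fix, using the field name from the question:

item['file_urls'] = [response.url]  # a list of URLs, even for a single file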

Creating module via web-interface

I followed these steps to create a module named "x_test":
1. Settings --> Technical --> Database Structure --> Models --> Create
Model Description : Test
Model :x_test
2. Add Fields
Name: x_sample
Field Label: sample
Field Type: boolean
3. Save
4. Click on Create a Menu
5. Select appropriate menu and click on "CREATE MENU"
6. Click on the menu provided and try to provide values for my custom module
7. When I "SAVE" my record, I am getting the following error:
Traceback (most recent call last):
File "/var/app/openerp/server/openerp/netsvc.py", line 292, in dispatch_rpc
result = ExportService.getService(service_name).dispatch(method, params)
File "/var/app/openerp/server/openerp/service/web_services.py", line 626, in dispatch
res = fn(db, uid, *params)
File "/var/app/openerp/server/openerp/osv/osv.py", line 188, in execute_kw
return self.execute(db, uid, obj, method, *args, **kw or {})
File "/var/app/openerp/server/openerp/osv/osv.py", line 131, in wrapper
return f(self, dbname, *args, **kwargs)
File "/var/app/openerp/server/openerp/osv/osv.py", line 197, in execute
res = self.execute_cr(cr, uid, obj, method, *args, **kw)
File "/var/app/openerp/server/openerp/osv/osv.py", line 185, in execute_cr
return getattr(object, method)(cr, uid, *args, **kw)
File "/var/app/openerp/server/openerp/osv/orm.py", line 4434, in create
cr.execute('insert into "'+self._table+'" (id'+upd0+") values ("+str(id_new)+upd1+')', tuple(upd2))
File "/var/app/openerp/server/openerp/sql_db.py", line 161, in wrapper
return f(self, *args, **kwargs)
File "/var/app/openerp/server/openerp/sql_db.py", line 228, in execute
res = self._obj.execute(query, params)
ProgrammingError: column "x_sample" of relation "x_test" does not exist
LINE 1: insert into "x_test" (id,"x_sample",create_uid,create_date,wri...
Is there any mistake in my approach to creating a module via the web interface?
I had the same issue: go into the database and, within the table x_test, create the field you added in the model.
As far as I'm concerned it should create the fields for you automatically, but I have had to create them manually.
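For illustration, adding the missing column by hand could look like this; a sketch assuming direct PostgreSQL access with psycopg2, with placeholder connection details:

import psycopg2

# Placeholder connection details; use your actual OpenERP database credentials.
conn = psycopg2.connect(dbname="openerp_db", user="openerp", password="secret")
with conn, conn.cursor() as cur:
    # Create the column the ORM expects but that was never added to the table.
    cur.execute('ALTER TABLE x_test ADD COLUMN x_sample boolean')
conn.close()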

How to use pipeline items in Scrapy

I'm a new user of Scrapy and I'm using it to crawl my websites. I want to store the crawled data in a MySQL database.
myspider.py:
class MininovaSpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']
    rules = [Rule(SgmlLinkExtractor(allow=('/categorie/.*'), restrict_xpaths=('//div[@id="contLeftNavig"]',)), 'parse_t')]

    def parse_t(self, response):
        x = HtmlXPathSelector(response)
        torrent = Torrent()
        torrent['url'] = response.url
        torrent['title'] = x.select("//h1[@class='infoAneTitre']/text()").extract()
        torrent['wilaya'] = x.select("//span[@class='ville_t']/text()").extract()
        #torrent['prix'] = x.select("//div[@id='datail_ann']/ul[1]/li[4]/span/text()").extract()
        #torrent['surface'] = x.select("//div[@id='datail_ann']/ul[3]/li[1]/span/text()").extract()
        torrent['description'] = x.select("//div[@class='box_pad']/text()").extract()
        return torrent
For pipelines.py, I modified and used the googledir example, and when I run the crawl I get this error:
exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
pipelines.py:
from scrapy import log
from twisted.enterprise import adbapi
import time
import MySQLdb.cursors

class Pipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                                            db='test',
                                            user='root',
                                            passwd='',
                                            cursorclass=MySQLdb.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=True)

    def process_item(self, spider, item):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        tx.execute("select * from database where url = %s", (item['url'] ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into database (wilaya, titre, site, lien, resume, timestamp) "
                "values (%s, %s, %s, %s, %s, %s)",
                (item['wilaya'],
                 item['title'],
                 'example.com', item['url'], item['description'],
                 time.time()))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
and traceback:
Traceback (most recent call last):
File "/usr/lib/python2.7/twisted/internet/defer.py", line 287, in addCallbacks
self._runCallbacks()
File "/usr/lib/python2.7/twisted/internet/defer.py", line 545, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/core/scraper.py", line 208, in _itemproc_finished
item=output, response=response, spider=spider)
File "/usr/lib/python2.7/site-packages/scrapy/utils/signal.py", line 53, in send_catch_log_deferred
*arguments, **named)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/internet/defer.py", line 134, in maybeDeferred
result = f(*args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/xlib/pydispatch/robustapply.py", line 47, in robustApply
return receiver(*arguments, **named)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/feedexport.py", line 177, in item_scraped
slot.exporter.export_item(item)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 109, in export_item
itemdict = dict(self._get_serialized_fields(item))
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 60, in _get_serialized_fields
field_iter = item.iterkeys()
exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
2012-01-18 16:00:43-0600 [scrapy] Unhandled Error
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 503, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 530, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 483, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/python/threadpool.py", line 207, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/lib/python2.7/twisted/enterprise/adbapi.py", line 448, in _runInteraction
result = interaction(trans, *args, **kw)
File "/opt/scrapy/test/pipelines.py", line 33, in _conditional_insert
tx.execute("select * from database where url = %s", (item['url'] ))
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
It looks like you have yielded a spider (MininovaSpider) instance somewhere instead of an item. I think there is more code that you haven't shown.
Put this in Pipeline.process_item() to confirm:
def process_item(self, spider, item):
    assert isinstance(item, Torrent), 'Here should be Torrent instance!'
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)
    return item
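Worth checking as well: Scrapy's documented pipeline signature is process_item(self, item, spider), with the item first and the spider second. With the argument order used in the pipeline above, the name item is bound to the spider instance inside the method, which would produce exactly the two errors shown. A sketch of the corrected method (an addition for illustration, not part of the original answer):

def process_item(self, item, spider):  # item first, spider second
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)
    return item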