Scrapy KeyError and next page URL not working

I am trying to scrape using this page as the start URL: https://www.imdb.com/lists/tt0237478?ref_=tt_rls_sm
This page has 3 lists, and one of the lists has 100+ items.
My code scrapes only 100 items and does not fetch data from the next page. Please check what is wrong with the code.
import scrapy
from urllib.parse import urljoin

class lisTopSpider(scrapy.Spider):
    name = 'ImdbListsSpider'
    allowed_domains = ['imdb.com']
    start_urls = [
        'https://www.imdb.com/lists/tt0237478'
    ]

    def parse(self, response):
        listsLinks = response.xpath('//div[2]/strong')
        for link in listsLinks:
            list_url = response.urljoin(link.xpath('.//a/@href').get())
            yield scrapy.Request(list_url, callback=self.parse_list, meta={'list_url': list_url})

        next_page_url = response.xpath('//a[@class="flat-button next-page "]/@href').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_list(self, response):
        list_url = response.meta['list_url']
        titles = response.xpath('//h3/a/@href').getall()

        next_page_url = response.xpath('//a[@class="flat-button lister-page-next next-page"]/@href').get()
        if next_page_url is not None:
            next_page_url = urljoin('https://www.imdb.com', next_page_url)
            print('here is next page url')
            print(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse_list)

        yield {
            'listurl': list_url,
            'titles': titles,
        }
Here is the error
2020-05-06 21:09:29 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.imdb.com/list/ls055923961/?page=2> (referer: https://www.imdb.com/list/ls055923961/)
Traceback (most recent call last):
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37,
in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\python projects\scrapy\imdb_project\virenv\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Python Projects\Scrapy\imdb_project\imdb_project\spiders\TopLists.py", line 29, in parse_list
list_url = response.meta['list_url']
KeyError: 'list_url'

You are using Request.meta in your parse method to pass list_url to parse_list, but you forgot to do the same in the Request you yield inside parse_list for the next pages.
Simply add meta={'list_url': list_url} to the Request inside parse_list and it should work fine.
So the handling of next pages in parse_list should look like this:
if next_page_url is not None:
    next_page_url = urljoin('https://www.imdb.com', next_page_url)
    yield scrapy.Request(next_page_url, callback=self.parse_list, meta={'list_url': list_url})
By the way: since Scrapy 1.7, the preferred way of passing user-defined information to callbacks is Request.cb_kwargs (see the "Caution" note in the official documentation).
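If you want to go that route, here is a minimal sketch using cb_kwargs, mirroring the spider from the question (illustrative only, not a tested drop-in replacement):
import scrapy

class ListSpider(scrapy.Spider):
    name = 'imdb_lists_cb_kwargs_sketch'
    start_urls = ['https://www.imdb.com/lists/tt0237478']

    def parse(self, response):
        for link in response.xpath('//div[2]/strong'):
            list_url = response.urljoin(link.xpath('.//a/@href').get())
            # Entries in cb_kwargs become keyword arguments of the callback.
            yield scrapy.Request(list_url, callback=self.parse_list,
                                 cb_kwargs={'list_url': list_url})

    def parse_list(self, response, list_url):
        titles = response.xpath('//h3/a/@href').getall()
        next_page_url = response.xpath('//a[@class="flat-button lister-page-next next-page"]/@href').get()
        if next_page_url is not None:
            # Keep forwarding list_url to the next page, exactly as with meta above.
            yield response.follow(next_page_url, callback=self.parse_list,
                                  cb_kwargs={'list_url': list_url})
        yield {'listurl': list_url, 'titles': titles}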

Related

Error when using Tensorflow bucket_by_sequence_length() and tf.py_function() together

I have the following sample code:
import glob
import random
import tensorflow as tf
import cv2

def random_blur(image):
    # do stuff which can't be done with a tf.image function...
    random_x_blur = random.randint(1, 3)
    random_y_blur = random.randint(1, 3)
    return cv2.blur(image, (random_x_blur, random_y_blur))

def transform(image):
    image = tf.image.random_jpeg_quality(image, 75, 100)
    image = tf.image.random_brightness(image, 0.5)
    image = tf.image.random_contrast(image, 0.2, 0.5)
    # here's the problem...
    # image = tf.py_function(func=random_blur, inp=[image], Tout=tf.uint8)
    return image

def process_path(file_path):
    image = tf.io.read_file(file_path)
    image = tf.image.decode_png(image, channels=1)
    return transform(image), image

train_directory = 'data/small/'
train_files = glob.glob(train_directory + '*.png')
ds_train = tf.data.Dataset.from_tensor_slices(train_files)
boundaries = [100, 200, 300, 400]
batch_sizes = [16, 16, 16, 16, 16]
ds_train = ds_train.map(process_path, 4)
ds_train = ds_train.bucket_by_sequence_length(element_length_func=lambda x, y: tf.shape(x)[1],
                                              bucket_boundaries=boundaries,
                                              bucket_batch_sizes=batch_sizes)
I'm trying to create a TensorFlow Dataset from variable-width, 60px-high images in a directory, using the bucket_by_sequence_length() function to ensure the images in each minibatch have the same dimensions. This all works fine until I uncomment the line beneath "here's the problem" in the code above; running it then produces the following error:
Traceback (most recent call last):
File "test.py", line 34, in <module>
ds_train = ds_train.bucket_by_sequence_length(element_length_func=lambda x, y: tf.shape(x)[1],
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 3120, in bucket_by_sequence_length
return self.group_by_window(
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2976, in group_by_window
return _GroupByWindowDataset(
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 5841, in __init__
self._make_reduce_func(reduce_func, input_dataset)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 5890, in _make_reduce_func
self._reduce_func = structured_function.StructuredFunctionWrapper(
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/structured_function.py", line 271, in __init__
self._function = fn_factory()
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/eager/function.py", line 2610, in get_concrete_function
graph_function = self._get_concrete_function_garbage_collected(
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/eager/function.py", line 2576, in _get_concrete_function_garbage_collected
graph_function, _ = self._maybe_define_function(args, kwargs)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/eager/function.py", line 2760, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/eager/function.py", line 2670, in _create_graph_function
func_graph_module.func_graph_from_py_func(
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/framework/func_graph.py", line 1247, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/structured_function.py", line 248, in wrapped_fn
ret = wrapper_helper(*args)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/structured_function.py", line 177, in wrapper_helper
ret = autograph.tf_convert(self._func, ag_ctx)(*nested_args)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/autograph/impl/api.py", line 689, in wrapper
return converted_call(f, args, kwargs, options=options)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/autograph/impl/api.py", line 377, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/autograph/impl/api.py", line 458, in _call_unconverted
return f(*args, **kwargs)
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 3111, in batching_fn
shapes = make_padded_shapes(
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 3083, in make_padded_shapes
shape = [
File "/Users/garnet/Library/Python/3.8/lib/python/site-packages/tensorflow/python/framework/tensor_shape.py", line 882, in __iter__
raise ValueError("Cannot iterate over a shape with unknown rank.")
ValueError: Cannot iterate over a shape with unknown rank.
Likewise, my code works fine if I uncomment that line but remove the call to bucket_by_sequence_length() and limit my training data to images with identical dimensions.
It seems that bucket_by_sequence_length() and tf.py_function() don't play nicely together, even with eager mode enabled. I need to do some image augmentations/transformations that the standard tf.image functions don't provide. Any ideas?
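One workaround that is often suggested for this class of error, sketched here under the assumption that the only missing piece is static shape information: tf.py_function returns a tensor of unknown rank, and bucket_by_sequence_length() has to build padded shapes, so re-declaring the shape right after the call may be enough. The random_blur adjustments below are also assumptions (converting the eager tensor to NumPy for OpenCV and restoring the channel axis), not tested against this exact pipeline:
def random_blur(image):
    # tf.py_function hands us an EagerTensor; convert it to a NumPy array for OpenCV.
    image = image.numpy()
    ksize = (random.randint(1, 3), random.randint(1, 3))
    blurred = cv2.blur(image, ksize)
    # cv2.blur drops the trailing channel axis for single-channel images, so put it back.
    if blurred.ndim == 2:
        blurred = blurred[..., None]
    return blurred

def transform(image):
    image = tf.image.random_jpeg_quality(image, 75, 100)
    image = tf.image.random_brightness(image, 0.5)
    image = tf.image.random_contrast(image, 0.2, 0.5)
    image = tf.py_function(func=random_blur, inp=[image], Tout=tf.uint8)
    # py_function erases the static shape; bucket_by_sequence_length() needs at least a
    # known rank, so declare it again: 60 px high, variable width, 1 channel (must match
    # what random_blur actually returns).
    image.set_shape([60, None, 1])
    return image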

Odoo crashes when creating several records at the same time

I have created a method that imports a csv file and creates new records from the data it contains. The method works with a csv of around 1000 lines (it can create around 1000 new records), but when the csv file has more than 1200 lines, the Odoo server crashes and I have to restart it. Here are my method and the Odoo log.
@api.multi
@profile
def action_import_csv(self):
    cuenta = 0
    self._chequear_extension_csv(self.archivo_filename)
    res = base64.b64decode(self.archivo)
    text = res.decode("UTF-8")
    reader = csv.DictReader(io.StringIO(text))
    brigadista = self.env['utepda_brigadas.brigadista']
    brigada = self.env['utepda_brigadas.brigada']
    brigada_cr = self.env.cr
    brigadista_brigada = self.env['utepda_brigadas.brigadista_brigada']
    count = 0
    campos = [
        'BRIGADA', 'NOMBRE', 'CEDULA', 'CARGO', 'JORNALES', 'COSTO_JORNADA'
    ]
    rows = list(reader)
    totalrows = len(rows)
    print("La cantidad de filas es {}".format(totalrows))
    for index, row in enumerate(rows):
        print("Index -> {} -> {}".format(index, row))
        if count == 0:
            count = 1
            self._chequear_campos_csv(campos, row.keys())
        else:
            codigo_brigada = row['BRIGADA']
            nombre = row['NOMBRE']
            cedula = row['CEDULA'].replace('-', '')
            cargo = row['CARGO']
            dias_trabajados = row['JORNALES']
            total_jornada = row['COSTO_JORNADA']
            fecha = self.fecha
            total = int(dias_trabajados) * int(total_jornada)
            existe_brigadista = brigadista.search([['cedula', '=', cedula]], limit=1)
            brigada_actual = brigada.search([['codigo', '=', codigo_brigada]], limit=1)
            #brigada_cr.execute(
            #    "SELECT id FROM public.utepda_brigada_brigada WHERE codigo=%s"
            #    % codigo_brigada)
            #res = brigada_cr.fetchone()
            #brigada_actual = brigada.search([['codigo', '=', codigo_brigada]],
            #                                limit=1)
            #brigada_actual = res[0]
            if not existe_brigadista.id:
                new = {
                    'name': nombre,
                    'cedula': cedula,
                    'cargo': cargo,
                    'estado': "nuevo",
                    'brigada_ids': [(0, _, {
                        'fecha': fecha,
                        'dias_trabajados': dias_trabajados,
                        'total': total,
                        'brigada_id': brigada_actual.id
                    })]
                }
                nuevo_brigadista = brigadista.create(new)
                if nuevo_brigadista.id:
                    cuenta = cuenta + 1
                    print("""{} Se ha insertado el brigadista {} con nombre {}""".format(
                        cuenta, nuevo_brigadista.id, nuevo_brigadista.name))
                #existe_brigadista = existe_brigadista[0]
            else:
                existe_brigadista.write({
                    'estado': "reportado",
                    'brigada_ids': [(0, _, {
                        'fecha': fecha,
                        'dias_trabajados': dias_trabajados,
                        'total': total,
                        'brigada_id': brigada_actual.id
                    })]
                })
    fecha_format = datetime.strftime(self.fecha, '%m/%Y')
    no_reportados = brigadista.search(
        [['fecha_ultimo_reporte', '!=', fecha_format]])
    no_reportados.write({'estado': "no_reportado"})
    return {
        'type': 'ir.actions.client',
        'tag': 'reload',
    }
Here is the Odoo log
2020-11-04 11:44:23,285 15044 WARNING odoo odoo.service.server: Thread <Thread(odoo.service.http.request.140355723687680, started 140355723687680)> virtual real time limit (152/120s) reached.
2020-11-04 11:44:23,289 15044 INFO odoo odoo.service.server: Dumping stacktrace of limit exceeding threads before reloading
2020-11-04 11:44:23,520 15044 INFO odoo odoo.tools.misc:
# Thread: <Thread(odoo.service.http.request.140355723687680, started 140355723687680)> (db:odoo) (uid:2) (url:http://localhost:8069/web/dataset/call_button)
File: "/home/ernesto/.vscode/extensions/ms-python.python-2020.5.86806/pythonFiles/lib/python/debugpy/no_wheels/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 823, in __call__
ret = self.original_func(*self.args, **self.kwargs)
File: "/usr/lib/python3.6/threading.py", line 884, in _bootstrap
self._bootstrap_inner()
File: "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File: "/usr/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File: "/usr/lib/python3.6/socketserver.py", line 654, in process_request_thread
self.finish_request(request, client_address)
File: "/usr/lib/python3.6/socketserver.py", line 364, in finish_request
self.RequestHandlerClass(request, client_address, self)
File: "/usr/lib/python3.6/socketserver.py", line 724, in __init__
self.handle()
File: "/usr/local/lib/python3.6/dist-packages/werkzeug/serving.py", line 228, in handle
rv = BaseHTTPRequestHandler.handle(self)
File: "/usr/lib/python3.6/http/server.py", line 418, in handle
self.handle_one_request()
File: "/usr/local/lib/python3.6/dist-packages/werkzeug/serving.py", line 263, in handle_one_request
return self.run_wsgi()
File: "/usr/local/lib/python3.6/dist-packages/werkzeug/serving.py", line 205, in run_wsgi
execute(self.server.app)
File: "/usr/local/lib/python3.6/dist-packages/werkzeug/serving.py", line 193, in execute
application_iter = app(environ, start_response)
File: "/home/ernesto/odoo12/odoo/service/server.py", line 434, in app
return self.app(e, s)
File: "/home/ernesto/odoo12/odoo/service/wsgi_server.py", line 142, in application
return application_unproxied(environ, start_response)
File: "/home/ernesto/odoo12/odoo/service/wsgi_server.py", line 117, in application_unproxied
result = odoo.http.root(environ, start_response)
File: "/home/ernesto/odoo12/odoo/http.py", line 1320, in __call__
return self.dispatch(environ, start_response)
File: "/home/ernesto/odoo12/odoo/http.py", line 1293, in __call__
return self.app(environ, start_wrapped)
File: "/usr/local/lib/python3.6/dist-packages/werkzeug/wsgi.py", line 599, in __call__
return self.app(environ, start_response)
File: "/home/ernesto/odoo12/odoo/http.py", line 1488, in dispatch
result = ir_http._dispatch()
File: "/home/ernesto/odoo12/addons/auth_signup/models/ir_http.py", line 19, in _dispatch
return super(Http, cls)._dispatch()
File: "/home/ernesto/odoo12/addons/web_editor/models/ir_http.py", line 22, in _dispatch
return super(IrHttp, cls)._dispatch()
File: "/home/ernesto/odoo12/odoo/addons/base/models/ir_http.py", line 203, in _dispatch
result = request.dispatch()
File: "/home/ernesto/odoo12/odoo/http.py", line 698, in dispatch
result = self._call_function(**self.params)
File: "/home/ernesto/odoo12/odoo/http.py", line 346, in _call_function
return checked_call(self.db, *args, **kwargs)
File: "/home/ernesto/odoo12/odoo/service/model.py", line 98, in wrapper
return f(dbname, *args, **kwargs)
File: "/home/ernesto/odoo12/odoo/http.py", line 339, in checked_call
result = self.endpoint(*a, **kw)
File: "/home/ernesto/odoo12/odoo/http.py", line 941, in __call__
return self.method(*args, **kw)
File: "/home/ernesto/odoo12/odoo/http.py", line 519, in response_wrap
response = f(*args, **kw)
File: "/home/ernesto/odoo12/addons/web/controllers/main.py", line 966, in call_button
action = self._call_kw(model, method, args, {})
File: "/home/ernesto/odoo12/addons/web/controllers/main.py", line 954, in _call_kw
return call_kw(request.env[model], method, args, kwargs)
File: "/home/ernesto/odoo12/odoo/api.py", line 759, in call_kw
return _call_kw_multi(method, model, args, kwargs)
File: "/home/ernesto/odoo12/odoo/api.py", line 746, in _call_kw_multi
result = method(recs, *args, **kwargs)
File: "<decorator-gen-127>", line 2, in action_import_csv
File: "/home/ernesto/odoo12/odoo/tools/profiler.py", line 128, in _odooProfile
result = method(*args, **kwargs)
File: "/home/ernesto/odoo12/extra_addons/utepda_brigadas/models/model_wizard.py", line 81, in action_import_csv
nuevo_brigadista = brigadista.create(new)
File: "<decorator-gen-111>", line 2, in create
File: "/home/ernesto/odoo12/odoo/api.py", line 461, in _model_create_multi
return create(self, [arg])
File: "/home/ernesto/odoo12/addons/mail/models/mail_thread.py", line 278, in create
thread._message_log(body=_('%s created') % doc_name)
File: "/home/ernesto/odoo12/addons/mail/models/mail_thread.py", line 2230, in _message_log
message = self.env['mail.message'].sudo().create(message_values)
File: "<decorator-gen-107>", line 2, in create
File: "/home/ernesto/odoo12/odoo/api.py", line 440, in _model_create_single
return create(self, arg)
File: "/home/ernesto/odoo12/addons/mail/models/mail_message.py", line 990, in create
message = super(Message, self).create(values)
File: "<decorator-gen-3>", line 2, in create
File: "/home/ernesto/odoo12/odoo/api.py", line 461, in _model_create_multi
return create(self, [arg])
File: "/home/ernesto/odoo12/odoo/models.py", line 3583, in create
records = self._create(data_list)
File: "/home/ernesto/odoo12/odoo/models.py", line 3669, in _create
col_val = field.convert_to_column(val, self, stored)
File: "/home/ernesto/odoo12/odoo/fields.py", line 1555, in convert_to_column
strip_classes=self.strip_classes)
File: "/home/ernesto/odoo12/odoo/tools/mail.py", line 227, in html_sanitize
cleaned = cleaner.clean_html(src)
File: "/usr/local/lib/python3.6/dist-packages/lxml/html/clean.py", line 517, in clean_html
doc = fromstring(html)
File: "/usr/local/lib/python3.6/dist-packages/lxml/html/__init__.py", line 876, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File: "/usr/local/lib/python3.6/dist-packages/lxml/html/__init__.py", line 762, in document_fromstring
value = etree.fromstring(html, parser, **kw)
2020-11-04 11:44:23,522 15044 INFO odoo odoo.service.server: Initiating server reload
It's probably timing out due to the sheer number of records. Add the following to your config file (or pass the equivalent flag on the command line), but play around with the value to see what works best for you:
--limit-time-real 10000
limit_time_real = 480
I would also increase the number of workers if you can, so as not to slow Odoo down:
https://www.odoo.com/documentation/14.0/setup/deploy.html#builtin-server
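For orientation, here is a sketch of how these options might look in an odoo.conf [options] section; the numbers are placeholders to tune for your server, not recommendations:
[options]
# Per-request time limits, in seconds (the log above shows the 120 s default
# for limit_time_real being exceeded).
limit_time_cpu = 600
limit_time_real = 1200
# Number of worker processes; 0 keeps the threaded (development) server.
workers = 4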

Scrapy Selenium geckodriver problem - error while trying to scrape

Unhandled error in Deferred:
2020-07-24 09:12:40 [twisted] CRITICAL: Unhandled error in Deferred:
Traceback (most recent call last):
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/crawler.py", line 192, in crawl
    return self._crawl(crawler, *args, **kwargs)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/crawler.py", line 196, in _crawl
    d = crawler.crawl(*args, **kwargs)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 1613, in unwindGenerator
    return _cancellableInlineCallbacks(gen)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 1529, in _cancellableInlineCallbacks
    _inlineCallbacks(None, g, status)
--- <exception caught here> ---
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
    result = g.send(result)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/crawler.py", line 87, in crawl
    self.engine = self._create_engine()
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/crawler.py", line 101, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/core/engine.py", line 69, in __init__
    self.downloader = downloader_cls(crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/core/downloader/__init__.py", line 83, in __init__
    self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/middleware.py", line 53, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/middleware.py", line 35, in from_settings
    mw = create_instance(mwcls, settings, crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/utils/misc.py", line 150, in create_instance
    instance = objcls.from_crawler(crawler, *args, **kwargs)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy_selenium/middlewares.py", line 67, in from_crawler
    middleware = cls(
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy_selenium/middlewares.py", line 43, in __init__
    for argument in driver_arguments:
builtins.TypeError: 'NoneType' object is not iterable

2020-07-24 09:12:40 [twisted] CRITICAL:
Traceback (most recent call last):
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
    result = g.send(result)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/crawler.py", line 87, in crawl
    self.engine = self._create_engine()
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/crawler.py", line 101, in _create_engine
    return ExecutionEngine(self, lambda _: self.stop())
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/core/engine.py", line 69, in __init__
    self.downloader = downloader_cls(crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/core/downloader/__init__.py", line 83, in __init__
    self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/middleware.py", line 53, in from_crawler
    return cls.from_settings(crawler.settings, crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/middleware.py", line 35, in from_settings
    mw = create_instance(mwcls, settings, crawler)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy/utils/misc.py", line 150, in create_instance
    instance = objcls.from_crawler(crawler, *args, **kwargs)
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy_selenium/middlewares.py", line 67, in from_crawler
    middleware = cls(
  File "/home/baku/Dev/workspace/moje-python/scrape_linkedin/venv/lib/python3.8/site-packages/scrapy_selenium/middlewares.py", line 43, in __init__
    for argument in driver_arguments:
TypeError: 'NoneType' object is not iterable
My settings.py:
from shutil import which
SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_BROWSER_EXECUTABLE_PATH = which('firefox')
...
'scrapy_selenium.SeleniumMiddleware': 800,
Looks like the permissions for the driver are good:
:/usr/local/bin$ ll | grep gecko
-rwxrwxrwx 1 baku baku 7008696 lip 24 09:09 geckodriver*
crawler code:
class LinkedInProfileSeleniumSpider(scrapy.Spider):
    name = 'lips'
    allowed_domains = ['www.linkedin.com']

    def start_requests(self):
        yield SeleniumRequest(
            url="https://www.linkedin.com/login/",
            callback=self.proceed_login,
            wait_until=(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#username")
                )
            ),
            script='window.scrollTo(0, document.body.scrollHeight);',
            wait_time=30
        )

    def proceed_login(self, response):
        # AFTER LOGIN
        driver = response.request.meta['driver']
        ...
Can you please help me figure out why it's failing? Thanks!
(By the way, it works with the Chrome driver but fails with gecko.)
I had the same problem on a Mac; this attempt is on an Ubuntu machine.
I'm not sure what the issue is or where to start debugging.
It doesn't even get into self.proceed_login; it fails on the first request.
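The traceback points at scrapy_selenium's middleware iterating over driver_arguments in its __init__, which it reads from the SELENIUM_DRIVER_ARGUMENTS setting, so a hedged guess is that defining that setting (even as an empty list) avoids the 'NoneType' object is not iterable error. A sketch of settings.py under that assumption:
# settings.py (sketch) -- the values below are assumptions to try, not a confirmed fix.
from shutil import which

SELENIUM_DRIVER_NAME = 'firefox'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('geckodriver')
SELENIUM_BROWSER_EXECUTABLE_PATH = which('firefox')
# scrapy_selenium iterates over this list when building the driver; leaving it
# undefined means it is None, hence the TypeError in the traceback above.
SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # or [] to run with a visible browser

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}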

Error when developing an Odoo 9 custom module

I'm trying to code an Odoo 9 module that inherits from another module. When I try to install the new module, this error comes up:
Odoo Server Error
Traceback (most recent call last):
File "/opt/odoo/openerp/http.py", line 648, in _handle_exception
return super(JsonRequest, self)._handle_exception(exception)
File "/opt/odoo/openerp/http.py", line 685, in dispatch
result = self._call_function(**self.params)
File "/opt/odoo/openerp/http.py", line 321, in _call_function
return checked_call(self.db, *args, **kwargs)
File "/opt/odoo/openerp/service/model.py", line 118, in wrapper
return f(dbname, *args, **kwargs)
File "/opt/odoo/openerp/http.py", line 314, in checked_call
result = self.endpoint(*a, **kw)
File "/opt/odoo/openerp/http.py", line 964, in __call__
return self.method(*args, **kw)
File "/opt/odoo/openerp/http.py", line 514, in response_wrap
response = f(*args, **kw)
File "/opt/odoo/addons/web/controllers/main.py", line 892, in call_button
action = self._call_kw(model, method, args, {})
File "/opt/odoo/addons/web/controllers/main.py", line 880, in _call_kw
return getattr(request.registry.get(model), method)(request.cr, request.uid, *args, **kwargs)
File "/opt/odoo/openerp/api.py", line 250, in wrapper
return old_api(self, *args, **kwargs)
File "/opt/odoo/openerp/addons/base/module/wizard/base_module_upgrade.py", line 87, in upgrade_module
openerp.modules.registry.RegistryManager.new(cr.dbname, update_module=True)
File "/opt/odoo/openerp/modules/registry.py", line 386, in new
openerp.modules.load_modules(registry._db, force_demo, status, update_module)
File "/opt/odoo/openerp/modules/loading.py", line 338, in load_modules
loaded_modules, update_module)
File "/opt/odoo/openerp/modules/loading.py", line 237, in load_marked_modules
loaded, processed = load_module_graph(cr, graph, progressdict, report=report, skip_modules=loaded_modules, perform_checks=perform_checks)
File "/opt/odoo/openerp/modules/loading.py", line 123, in load_module_graph
load_openerp_module(package.name)
File "/opt/odoo/openerp/modules/module.py", line 331, in load_openerp_module
__import__('openerp.addons.' + module_name)
File "/opt/odoo/openerp/modules/module.py", line 61, in load_module
mod = imp.load_module('openerp.addons.' + module_part, f, path, descr)
File "/opt/odoo/addons/old_residual/__init__.py", line 3, in <module>
from . import models
File "/opt/odoo/addons/old_residual/models/__init__.py", line 3, in <module>
from . import old_residual
File "/opt/odoo/addons/old_residual/models/old_residual.py", line 14
for invoice in self:
^
IndentationError: expected an indented block
After the function declaration you should provide an indented block, i.e.
@api.multi
def _compute_old_residual(self):
    for invoice in self:
        invs = self.search([('state', '=', 'open'), ('partner_id', '=', invoice.partner_id.id)])
# -*- coding: utf-8 -*-
from openerp import models, fields, api

class old_residual(models.Model):
    _inherit = "account.invoice"

    old_residual = fields.Monetary(string='Nợ cũ', currency_field='company_currency_id', compute='_compute_old_residual')

    @api.multi
    def _compute_old_residual(self):
        for invoice in self:
            invs = self.search([('state', '=', 'open'), ('partner_id', '=', invoice.partner_id.id)])
            out_invoice = 0
            in_invoice = 0
            out_refund = 0
            in_refund = 0
            for inv in invs:
                if inv.type == 'out_invoice':
                    out_invoice += inv.residual
                if inv.type == 'in_invoice':
                    in_invoice += inv.residual
                if inv.type == 'out_refund':
                    out_refund += inv.residual
                if inv.type == 'in_refund':
                    in_refund += inv.residual
            invoice.old_residual = out_invoice + in_refund - in_invoice - out_refund - invoice.amount_total

How do we use item pipelines in Scrapy?

I'm a new user of Scrapy, crawling my websites. I want to store the crawled data in a MySQL database.
myspider.py:
class MininovaSpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']
    rules = [Rule(SgmlLinkExtractor(allow=('/categorie/.*'), restrict_xpaths=('//div[@id="contLeftNavig"]',)), 'parse_t')]

    def parse_t(self, response):
        x = HtmlXPathSelector(response)
        torrent = Torrent()
        torrent['url'] = response.url
        torrent['title'] = x.select("//h1[@class='infoAneTitre']/text()").extract()
        torrent['wilaya'] = x.select("//span[@class='ville_t']/text()").extract()
        #torrent['prix'] = x.select("//div[@id='datail_ann']/ul[1]/li[4]/span/text()").extract()
        #torrent['surface'] = x.select("//div[@id='datail_ann']/ul[3]/li[1]/span/text()").extract()
        torrent['description'] = x.select("//div[@class='box_pad']/text()").extract()
        return torrent
For pipelines.py, I modified and used the googledir example. So when I run the crawl I get these errors:
exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
pipeline.py:
from scrapy import log
from twisted.enterprise import adbapi
import time
import MySQLdb.cursors

class Pipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='test',
            user='root',
            passwd='',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, spider, item):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        tx.execute("select * from database where url = %s", (item['url'] ))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into database (wilaya,titre, site, lien,resume,timestamp) "
                "values (%s, %s, %s, %s,%s,%s)",
                (item['wilaya'],
                 item['title'],
                 'example.com', item['url'], item['description'],
                 time.time())
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
and traceback:
Traceback (most recent call last):
File "/usr/lib/python2.7/twisted/internet/defer.py", line 287, in addCallbacks
self._runCallbacks()
File "/usr/lib/python2.7/twisted/internet/defer.py", line 545, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/core/scraper.py", line 208, in _itemproc_finished
item=output, response=response, spider=spider)
File "/usr/lib/python2.7/site-packages/scrapy/utils/signal.py", line 53, in send_catch_log_deferred
*arguments, **named)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/internet/defer.py", line 134, in maybeDeferred
result = f(*args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/xlib/pydispatch/robustapply.py", line 47, in robustApply
return receiver(*arguments, **named)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/feedexport.py", line 177, in item_scraped
slot.exporter.export_item(item)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 109, in export_item
itemdict = dict(self._get_serialized_fields(item))
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 60, in _get_serialized_fields
field_iter = item.iterkeys()
exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
2012-01-18 16:00:43-0600 [scrapy] Unhandled Error
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 503, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 530, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 483, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/python/threadpool.py", line 207, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/lib/python2.7/twisted/enterprise/adbapi.py", line 448, in _runInteraction
result = interaction(trans, *args, **kw)
File "/opt/scrapy/test/pipelines.py", line 33, in _conditional_insert
tx.execute("select * from database where url = %s", (item['url'] ))
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
It looks like you have yielded a spider (MininovaSpider) instance somewhere instead of an item. I think you have more code there that you haven't shown.
Put this in Pipeline.process_item() to confirm:
def process_item(self, spider, item):
    assert isinstance(item, Torrent), 'Here should be Torrent instance!'
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)
    return item
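One more thing worth double-checking, based on Scrapy's documented pipeline API rather than on anything shown above: Scrapy calls the hook as process_item(item, spider), whereas the pipeline in the question declares process_item(self, spider, item). With the parameters swapped, the spider lands in the item slot, which could also explain both 'MininovaSpider' errors. A minimal sketch of the documented signature:
class Pipeline(object):
    # Scrapy passes the scraped item first and the spider second.
    def process_item(self, item, spider):
        # ... store `item` (now genuinely the scraped Torrent) in the database ...
        return item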