indeed scrapy can't retrieve data - scrapy

I run this command scrapy crawl indeed --logfile=crawl.log But no logfile is generated and I received the following error. I tried to debug the code, can't see nothing.
enter code here`File "C:\Anaconda3\Scripts\scrapy-script.py", line 10, in <module>
sys.exit(execute())
File "C:\Anaconda3\lib\site-packages\scrapy\cmdline.py", line 110, in execute
settings = get_project_settings()
File "C:\Anaconda3\lib\site-packages\scrapy\utils\project.py", line 68, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "C:\Anaconda3\lib\site-packages\scrapy\settings\__init__.py", line 295, in setmodule
self.set(key, getattr(module, key), priority)
File "C:\Anaconda3\lib\site-packages\scrapy\settings\__init__.py", line 270, in set
self.attributes[name].set(value, priority)
File "C:\Anaconda3\lib\site-packages\scrapy\settings\__init__.py", line 55, in set
value = BaseSettings(value, priority=priority)
File "C:\Anaconda3\lib\site-packages\scrapy\settings\__init__.py", line 91, in __init__
self.update(values, priority)
File "C:\Anaconda3\lib\site-packages\scrapy\settings\__init__.py", line 327, in update
for name, value in six.iteritems(values):
File "C:\Anaconda3\lib\site-packages\six.py", line 587, in iteritems
return iter(d.items(**kw))
AttributeError: 'list' object has no attribute 'items'

Related

pyshark "TypeError: sequence item 6: expected str instance, _io.TextIOWrapper found"

I am using pyshark for live packet capture. when I pass a parameter output_file = myFilObject for saving captures to a file,
getting following error on sniffing line. If output_file parameter is removed, this works absolutely fine. Please suggest.
MySampleCode:
import pyshark
def capturePacket():
outputF = open('capturepcap.pcap', 'w')
cap = pyshark.LiveCapture(interface='Ethernet 8', output_file=outputF)
cap.sniff(timeout=60)
outputF.close()
Error:
Traceback (most recent call last):
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "c:\Users\wxyz\.vscode\extensions\ms-python.python-2022.6.2\pythonFiles\lib\python\debugpy\__main__.py", line 45, in <module>
cli.main()
File "c:\Users\wxyz\.vscode\extensions\ms-python.python-2022.6.2\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 444, in main
run()
File "c:\Users\wxyz\.vscode\extensions\ms-python.python-2022.6.2\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 285, in run_file
runpy.run_path(target_as_str, run_name=compat.force_str("__main__"))
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 269, in run_path
return _run_module_code(code, init_globals, run_name,
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 96, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "c:\Users\wxyz\Documents\automation\practice_set_script\paket_capture\basic_packetCapture.py", line 29, in <module>
capturePacket()
File "c:\Users\wxyz\Documents\automation\practice_set_script\paket_capture\basic_packetCapture.py", line 22, in capturePacket
cap.sniff(timeout=60)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\site-packages\pyshark\capture\capture.py", line 137, in load_packets
self.apply_on_packets(keep_packet, timeout=timeout, packet_count=packet_count)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\site-packages\pyshark\capture\capture.py", line 274, in apply_on_packets
return self.eventloop.run_until_complete(coro)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 641, in run_until_complete
return future.result()
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\asyncio\tasks.py", line 445, in wait_for
return fut.result()
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\site-packages\pyshark\capture\capture.py", line 283, in packets_from_tshark
tshark_process = await self._get_tshark_process(packet_count=packet_count)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\site-packages\pyshark\capture\live_capture.py", line 94, in _get_tshark_process
tshark = await super(LiveCapture, self)._get_tshark_process(packet_count=packet_count, stdin=read)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\site-packages\pyshark\capture\capture.py", line 399, in _get_tshark_process
self._log.debug("Creating TShark subprocess with parameters: " + " ".join(parameters))
TypeError: sequence item 6: expected str instance, _io.TextIOWrapper found
Error on reading from the event loop self pipe
loop: <ProactorEventLoop running=True closed=False debug=False>
Traceback (most recent call last):
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\asyncio\proactor_events.py", line 779, in _loop_self_reading
f = self._proactor.recv(self._ssock, 4096)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\asyncio\windows_events.py", line 450, in recv
self._register_with_iocp(conn)
File "C:\Users\wxyz\AppData\Local\Programs\Python\Python310\lib\asyncio\windows_events.py", line 723, in _register_with_iocp
_overlapped.CreateIoCompletionPort(obj.fileno(), self._iocp, 0, 0)
OSError: [WinError 87] The parameter is incorrect
PS C:\Users\wxyz\Documents\automation\practice_set_script\paket_capture>
The issue in your code is these lines:
outputF = open('capturepcap.pcap', 'w')
cap = pyshark.LiveCapture(interface='Ethernet 8', output_file=outputF)
The output_file parameter is a string and not a io.TextIOWrapper
:param output_file: A string of a file to write every read packet into (useful when filtering).
So this works:
import pyshark
def capturePacket():
cap = pyshark.LiveCapture(interface='en0', output_file='capturepcap.pcap')
cap.sniff(timeout=60)
capturePacket()
Here is a reference that I put together on using PyShark

pandas 1.3.3 to_feather giving ArrowMemoryError

I have a dataset of size around 270MB and I use the following to write to feather file:
df.reset_index().to_feather(feather_path)
This gives me an error :
File "C:\apps\Python\lib\site-packages\pandas\util\_decorators.py", line 207, in wrapper
return func(*args, **kwargs)
File "C:\apps\Python\lib\site-packages\pandas\core\frame.py", line 2519, in to_feather
to_feather(self, path, **kwargs)
File "C:\apps\Python\lib\site-packages\pandas\io\feather_format.py", line 87, in to_feather
feather.write_feather(df, handles.handle, **kwargs)
File "C:\apps\Python\lib\site-packages\pyarrow\feather.py", line 152, in write_feather
table = Table.from_pandas(df, preserve_index=False)
File "pyarrow\table.pxi", line 1553, in pyarrow.lib.Table.from_pandas
File "C:\apps\Python\lib\site-packages\pyarrow\pandas_compat.py", line 607, in dataframe_to_arrays
arrays[i] = maybe_fut.result()
File "C:\apps\Python\lib\concurrent\futures\_base.py", line 438, in result
return self.__get_result()
File "C:\apps\Python\lib\concurrent\futures\_base.py", line 390, in __get_result
raise self._exception
File "C:\apps\Python\lib\concurrent\futures\thread.py", line 52, in run
result = self.fn(*self.args, **self.kwargs)
File "C:\apps\Python\lib\site-packages\pyarrow\pandas_compat.py", line 575, in convert_column
result = pa.array(col, type=type_, from_pandas=True, safe=safe)
File "pyarrow\array.pxi", line 302, in pyarrow.lib.array
File "pyarrow\array.pxi", line 83, in pyarrow.lib._ndarray_to_array
File "pyarrow\error.pxi", line 114, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: realloc of size 3221225472 failed
Note : This works well in PyCharm. No issues writing the feather file.
But when the python program is called in a Windows batch file like:
call python "myprogram.py"
and when I schedule the batch file in a task using Task Scheduler it fails with above memory error.
PyArrow version is 5.0.0 if that helps.
Any ideas please?

IndentationError: expected an indented block, Scrapy

career#careercrawler:~/stack/stack$ scrapy crawl stack
Traceback (most recent call last): File
"/home/career/.local/bin/scrapy", line 11, in
sys.exit(execute())
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/cmdline.py",
line 141, in execute
cmd.crawler_process = CrawlerProcess(settings)
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/crawler.py",
line 238, in init
super(CrawlerProcess, self).init(settings)
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/crawler.py",
line 129, in init
self.spider_loader = _get_spider_loader(settings)
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/crawler.py",
line 325, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/spiderloader.py",
line 33, in from_settings
return cls(settings)
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/spiderloader.py",
line 20, in init
self._load_all_spiders()
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/spiderloader.py",
line 28, in _load_all_spiders
for module in walk_modules(name):
File
"/home/career/.local/lib/python2.7/site-packages/scrapy/utils/misc.py",
line 71, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/init.py", line 37, in
import_module
import(name)
File "/home/career/stack/stack/spiders/stack_spider.py", line 4, in
from stack.items import StackItem
File "/home/career/stack/stack/items.py", line 13
title = scrapy.Field()
^
IndentationError: expected an indented block
This is my error, I don't know what is happening there. Someone help me, please.
This error is because of intention,
As mentioned in the traceback:
/home/career/stack/stack/items.py", line 13 title = scrapy.Field()
go to ~/stack/stack/items.py and check indentation at line 13.

Scrapy calling spider other than the one specified on the command line

(P6Svenv)malikarumi#Tetuoan2:~/Projects/P6/P6Svenv/test2/test2/spiders$ scrapy crawl zomd
Traceback (most recent call last):
File "/usr/bin/scrapy", line 9, in <module>
load_entry_point('Scrapy==1.0.3.post6-g2d688cd', 'console_scripts', 'scrapy')()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 142, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 209, in __init__
super(CrawlerProcess, self).__init__(settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 115, in __init__
self.spider_loader = _get_spider_loader(settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 296, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "/usr/lib/pymodules/python2.7/scrapy/spiderloader.py", line 30, in from_settings
return cls(settings)
File "/usr/lib/pymodules/python2.7/scrapy/spiderloader.py", line 21, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/malikarumi/Projects/P6/P6Svenv/test2/test2/spiders/t350_crawl.py", line 36
def parse_item(self, response):
^
IndentationError: unindent does not match any outer indentation level
Do you see it? Scrapy isn't even calling the spider I specified on the command line!
I see that super in the traceback, but all my t350's are derived from CrawlSpider. zomd is subclassed from scrapy.Spider. Why is this happening and what do I do about it?
Spider's name doesn't equal to the file name. It is defined within the spider file by the second line below:
class CAPjobSpider(Spider):
name = "spider_name"
The above spider's name is "spider_name", even if the file may be "New_York.py".

Blank sessionid cookie causes error in request.user

An end user somehow ended up with their sessionid cookie blank (as in "sessionid=;"). This causes the following error call stack (below the function call to request.user) when using Django in conjunction with GAE:
File "/src/django/utils/functional.py", line 204, in inner
self._setup()
File "/src/django/utils/functional.py", line 270, in _setup
self._wrapped = self._setupfunc()
File "/src/django/contrib/auth/middleware.py", line 18, in <lambda>
request.user = SimpleLazyObject(lambda: get_user(request))
File "/src/django/contrib/auth/middleware.py", line 10, in get_user
request._cached_user = auth.get_user(request)
File "/src/django/contrib/auth/__init__.py", line 136, in get_user
user_id = request.session[SESSION_KEY]
File "/src/django/contrib/sessions/backends/base.py", line 44, in __getitem__
return self._session[key]
File "/src/django/contrib/sessions/backends/base.py", line 167, in _get_session
self._session_cache = self.load()
File "/src/django/contrib/sessions/backends/cached_db.py", line 39, in load
expire_date__gt=timezone.now()
File "/src/django/db/models/manager.py", line 143, in get
return self.get_query_set().get(*args, **kwargs)
File "/src/django/db/models/query.py", line 398, in get
num = len(clone)
File "/src/django/db/models/query.py", line 106, in __len__
self._result_cache = list(self.iterator())
File "/src/django/db/models/query.py", line 317, in iterator
for row in compiler.results_iter():
File "/src/djangotoolbox/db/basecompiler.py", line 375, in results_iter
results = self.build_query(fields).fetch(
File "/src/djangotoolbox/db/basecompiler.py", line 481, in build_query
query.add_filters(self.query.where)
File "/src/djangotoolbox/db/basecompiler.py", line 174, in add_filters
self.add_filters(child)
File "/src/djangotoolbox/db/basecompiler.py", line 176, in add_filters
field, lookup_type, value = self._decode_child(child)
File "/src/djangotoolbox/db/basecompiler.py", line 216, in _decode_child
lookup_type, value, field, annotation)
File "/src/djangotoolbox/db/basecompiler.py", line 254, in _normalize_lookup_value
return self.ops.value_for_db(value, field, lookup_type)
File "/src/djangoappengine/db/base.py", line 128, in value_for_db
return super_value_for_db(value, field, lookup)
File "/src/djangotoolbox/db/base.py", line 245, in value_for_db
field_kind, db_type, lookup)
File "/src/djangoappengine/db/base.py", line 160, in _value_for_db
raise DatabaseError("Only strings and positive integers "
DatabaseError: Only strings and positive integers may be used as keys on GAE.
This error does not occur if sessionid is set to some invalid non-empty value (such as "session=garbage;"). I think it is related to follow contrast of behavior in a python shell:
>>> Session.objects.filter(session_key='abc').exists()
0
>>> Session.objects.filter(session_key='').exists()
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/src/django/db/models/query.py", line 610, in exists
return self.query.has_results(using=self.db)
File "/src/django/db/models/sql/query.py", line 445, in has_results
return compiler.has_results()
File "/src/dbindexer/compiler.py", line 32, in has_results
return super(SQLCompiler, self).has_results()
File "/src/djangotoolbox/db/basecompiler.py", line 384, in has_results
return self.get_count(check_exists=True)
File "/src/djangotoolbox/db/basecompiler.py", line 468, in get_count
return self.build_query().count(high_mark)
File "/src/djangotoolbox/db/basecompiler.py", line 481, in build_query
query.add_filters(self.query.where)
File "/src/djangotoolbox/db/basecompiler.py", line 174, in add_filters
self.add_filters(child)
File "/src/djangotoolbox/db/basecompiler.py", line 176, in add_filters
field, lookup_type, value = self._decode_child(child)
File "/src/djangotoolbox/db/basecompiler.py", line 216, in _decode_child
lookup_type, value, field, annotation)
File "/src/djangotoolbox/db/basecompiler.py", line 254, in _normalize_lookup_value
return self.ops.value_for_db(value, field, lookup_type)
File "/src/djangoappengine/db/base.py", line 128, in value_for_db
return super_value_for_db(value, field, lookup)
File "/src/djangotoolbox/db/base.py", line 245, in value_for_db
field_kind, db_type, lookup)
File "/src/djangoappengine/db/base.py", line 160, in _value_for_db
raise DatabaseError("Only strings and positive integers "
DatabaseError: Only strings and positive integers may be used as keys on GAE.
Is this a djangoappengine or djangotoolbox bug, or a Django bug? What's the proper way to prevent this error, and consider user unauthenticated?
Ok, I think I may have to add a middleware class to handle this special case, and place it directly after SessionMiddleware:
class EmptySessionMiddleware(object):
def process_request(self, request):
session = request.session
if session.session_key is not None and len(session.session_key) == 0:
logging.info('[EmptySessionMiddleware] setting empty session key to None')
session._session_key = None
It's a weird special case, but basically the problem is that Django session middleware only checks for None session before looking up in db (not empty string), and an empty string primary key query in djangoappengine raises an exception. I'm not sure there's another way to handle this case.