pandas read_csv with storage_options working locally but not in Dataflow

I am trying to import data from an API into BigQuery (GBQ) and want to use Dataflow for it.
For reasons unknown and unimaginable to me, the API merely returns the URL of a ".csv.gz" file, which I then need to download and process before pushing the data into GBQ.
Furthermore, the API uses bearer-token authentication, so I was looking for a method that handles the download and parsing of the data as well as the auth, and found:
pd.read_csv('https://app.SOMEPROVIDER.com/api/reporting/download/SOMEID.csv.gz', storage_options={'Authorization': 'Bearer ' + bearer_token}, compression='gzip', header=0, sep=',', quotechar='"')
which works fantastically when I use it in my Beam pipeline locally (roughly as sketched below).
However, as soon as I upload the pipeline to Dataflow and run it there, I get the error message
ValueError: storage_options passed with file object or non-fsspec file path
Full trace:
"apache_beam/runners/common.py", line 1223, in
apache_beam.runners.common.DoFnRunner.process File
"apache_beam/runners/common.py", line 572, in
apache_beam.runners.common.SimpleInvoker.invoke_process File
".\filename.py", line 144, in process File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
610, in read_csv return _read(filepath_or_buffer, kwds) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
462, in _read parser = TextFileReader(filepath_or_buffer, **kwds) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
819, in __init__ self._engine = self._make_engine(self.engine) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
1050, in _make_engine return mapping[engine](self.f, **self.options) #
type: ignore[call-arg] File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
1867, in __init__ self._open_handles(src, kwds) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
1362, in _open_handles self.handles = get_handle( File
"/usr/local/lib/python3.8/site-packages/pandas/io/common.py", line
558, in get_handle ioargs = _get_filepath_or_buffer( File
"/usr/local/lib/python3.8/site-packages/pandas/io/common.py", line
286, in _get_filepath_or_buffer raise ValueError( ValueError:
storage_options passed with file object or non-fsspec file path During
handling of the above exception, another exception occurred: Traceback
(most recent call last): File
"/usr/local/lib/python3.8/site-packages/dataflow_worker/batchworker.py",
line 651, in do_work work_executor.execute() File
"/usr/local/lib/python3.8/site-packages/dataflow_worker/executor.py",
line 179, in execute op.start() File
"dataflow_worker/shuffle_operations.py", line 63, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 64, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 79, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 80, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "dataflow_worker/shuffle_operations.py", line 84, in
dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
File "apache_beam/runners/worker/operations.py", line 353, in
apache_beam.runners.worker.operations.Operation.output File
"apache_beam/runners/worker/operations.py", line 215, in
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "dataflow_worker/shuffle_operations.py", line 261, in
dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
File "dataflow_worker/shuffle_operations.py", line 268, in
dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
File "apache_beam/runners/worker/operations.py", line 353, in
apache_beam.runners.worker.operations.Operation.output File
"apache_beam/runners/worker/operations.py", line 215, in
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in
apache_beam.runners.worker.operations.DoOperation.process File
"apache_beam/runners/worker/operations.py", line 713, in
apache_beam.runners.worker.operations.DoOperation.process File
"apache_beam/runners/common.py", line 1225, in
apache_beam.runners.common.DoFnRunner.process File
"apache_beam/runners/common.py", line 1290, in
apache_beam.runners.common.DoFnRunner._reraise_augmented File
"apache_beam/runners/common.py", line 1223, in
apache_beam.runners.common.DoFnRunner.process File
"apache_beam/runners/common.py", line 752, in
apache_beam.runners.common.PerWindowInvoker.invoke_process File
"apache_beam/runners/common.py", line 875, in
apache_beam.runners.common.PerWindowInvoker._invoke_process_per_window
File "apache_beam/runners/common.py", line 1386, in
apache_beam.runners.common._OutputProcessor.process_outputs File
"apache_beam/runners/worker/operations.py", line 215, in
apache_beam.runners.worker.operations.SingletonConsumerSet.receive
File "apache_beam/runners/worker/operations.py", line 712, in
apache_beam.runners.worker.operations.DoOperation.process File
"apache_beam/runners/worker/operations.py", line 713, in
apache_beam.runners.worker.operations.DoOperation.process File
"apache_beam/runners/common.py", line 1225, in
apache_beam.runners.common.DoFnRunner.process File
"apache_beam/runners/common.py", line 1306, in
apache_beam.runners.common.DoFnRunner._reraise_augmented File
"apache_beam/runners/common.py", line 1223, in
apache_beam.runners.common.DoFnRunner.process File
"apache_beam/runners/common.py", line 572, in
apache_beam.runners.common.SimpleInvoker.invoke_process File
".\filename.py", line 144, in process File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
610, in read_csv return _read(filepath_or_buffer, kwds) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
462, in _read parser = TextFileReader(filepath_or_buffer, **kwds) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
819, in __init__ self._engine = self._make_engine(self.engine) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
1050, in _make_engine return mapping[engine](self.f, **self.options) #
type: ignore[call-arg] File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
1867, in __init__ self._open_handles(src, kwds) File
"/usr/local/lib/python3.8/site-packages/pandas/io/parsers.py", line
1362, in _open_handles self.handles = get_handle( File
"/usr/local/lib/python3.8/site-packages/pandas/io/common.py", line
558, in get_handle ioargs = _get_filepath_or_buffer( File
"/usr/local/lib/python3.8/site-packages/pandas/io/common.py", line
286, in _get_filepath_or_buffer raise ValueError( ValueError:
storage_options passed with file object or non-fsspec file path [while
running 'Fetch actual report data'] ```
Does anyone know why this works locally but not in the cloud? I assume it might have to do with the filesystem and temporary files, but then the error message does not make a lot of sense...
According to the pandas documentation, the storage_options parameter is handed to urllib (as request headers) for HTTPS links and only to fsspec for s3 and gcs paths.

It turned out to be just a version issue. The interpretation of the storage_options argument as authorization headers did not exist in the pandas version included in the Dataflow worker images; once I passed a local wheel of the newest available pandas version with the --extra_package parameter, the issue resolved itself.

Related

pandas 1.3.3 to_feather giving ArrowMemoryError

I have a dataset of around 270 MB and I use the following to write it to a feather file:
df.reset_index().to_feather(feather_path)
This gives me an error:
File "C:\apps\Python\lib\site-packages\pandas\util\_decorators.py", line 207, in wrapper
return func(*args, **kwargs)
File "C:\apps\Python\lib\site-packages\pandas\core\frame.py", line 2519, in to_feather
to_feather(self, path, **kwargs)
File "C:\apps\Python\lib\site-packages\pandas\io\feather_format.py", line 87, in to_feather
feather.write_feather(df, handles.handle, **kwargs)
File "C:\apps\Python\lib\site-packages\pyarrow\feather.py", line 152, in write_feather
table = Table.from_pandas(df, preserve_index=False)
File "pyarrow\table.pxi", line 1553, in pyarrow.lib.Table.from_pandas
File "C:\apps\Python\lib\site-packages\pyarrow\pandas_compat.py", line 607, in dataframe_to_arrays
arrays[i] = maybe_fut.result()
File "C:\apps\Python\lib\concurrent\futures\_base.py", line 438, in result
return self.__get_result()
File "C:\apps\Python\lib\concurrent\futures\_base.py", line 390, in __get_result
raise self._exception
File "C:\apps\Python\lib\concurrent\futures\thread.py", line 52, in run
result = self.fn(*self.args, **self.kwargs)
File "C:\apps\Python\lib\site-packages\pyarrow\pandas_compat.py", line 575, in convert_column
result = pa.array(col, type=type_, from_pandas=True, safe=safe)
File "pyarrow\array.pxi", line 302, in pyarrow.lib.array
File "pyarrow\array.pxi", line 83, in pyarrow.lib._ndarray_to_array
File "pyarrow\error.pxi", line 114, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: realloc of size 3221225472 failed
Note: This works fine in PyCharm; there are no issues writing the feather file.
But when the Python program is called from a Windows batch file like:
call python "myprogram.py"
and the batch file is scheduled as a task using Task Scheduler, it fails with the memory error above.
The PyArrow version is 5.0.0, if that helps.
Any ideas please?

RASA init error : tensorflow.python.framework.errors_impl.InvalidArgumentError: assertion failed: [0] [Op:Assert] name: EagerVariableNameReuse

I am new to Rasa. I installed Rasa 2.4.1 on my Windows 10, Python 3.7.6 machine without any error, but when I initialise a Rasa project I get the following error. I tried multiple Rasa 2.x versions and multiple TensorFlow installations, but no luck. Any help to resolve this issue is appreciated.
File "D:\NLP\rasa_env\Scripts\rasa.exe\__main__.py", line 7, in <module>
File "d:\nlp\rasa_env\lib\site-packages\rasa\__main__.py", line 116, in main
cmdline_arguments.func(cmdline_arguments)
File "d:\nlp\rasa_env\lib\site-packages\rasa\cli\scaffold.py", line 234, in run
init_project(args, path)
File "d:\nlp\rasa_env\lib\site-packages\rasa\cli\scaffold.py", line 129, in init_project
print_train_or_instructions(args, path)
File "d:\nlp\rasa_env\lib\site-packages\rasa\cli\scaffold.py", line 68, in print_train_or_instructions
training_result = rasa.train(domain, config, training_files, output)
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 109, in train
loop,
File "d:\nlp\rasa_env\lib\site-packages\rasa\utils\common.py", line 308, in run_in_loop
result = loop.run_until_complete(f)
File "c:\users\kni9kor\anaconda3\lib\asyncio\base_events.py", line 583, in run_until_complete
return future.result()
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 174, in train_async
finetuning_epoch_fraction=finetuning_epoch_fraction,
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 353, in _train_async_internal
finetuning_epoch_fraction=finetuning_epoch_fraction,
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 396, in _do_training
finetuning_epoch_fraction=finetuning_epoch_fraction,
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 818, in _train_nlu_with_validated_data
**additional_arguments,
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\train.py", line 116, in train
interpreter = trainer.train(training_data, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\model.py", line 209, in train
updates = component.train(working_data, self.config, **context)
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\classifiers\diet_classifier.py", line 810, in train
self.model = self._instantiate_model_class(model_data)
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\classifiers\diet_classifier.py", line 1132, in _instantiate_model_class
config=self.component_config,
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\classifiers\diet_classifier.py", line 1146, in __init__
super().__init__("DIET", config, data_signature, label_data)
File "d:\nlp\rasa_env\lib\site-packages\rasa\utils\tensorflow\models.py", line 705, in __init__
checkpoint_model=config[CHECKPOINT_MODEL],
File "d:\nlp\rasa_env\lib\site-packages\rasa\utils\tensorflow\models.py", line 91, in __init__
super().__init__(**kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\training\tracking\base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\keras\engine\training.py", line 308, in __init__
self._init_batch_counters()
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\training\tracking\base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\keras\engine\training.py", line 317, in _init_batch_counters
self._train_counter = variables.Variable(0, dtype='int64', aggregation=agg)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 262, in __call__
return cls._variable_v2_call(*args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 256, in _variable_v2_call
shape=shape)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 237, in <lambda>
previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2646, in default_variable_creator_v2
shape=shape)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 264, in __call__
return super(VariableMetaclass, cls).__call__(*args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 1518, in __init__
distribute_strategy=distribute_strategy)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 1666, in _init_from_args
graph_mode=self._in_graph_mode)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 243, in eager_safe_variable_handle
shape, dtype, shared_name, name, graph_mode, initial_value)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 175, in _variable_handle_from_shape_and_dtype
math_ops.logical_not(exists), [exists], name="EagerVariableNameReuse")
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 49, in _assert
_ops.raise_from_not_ok_status(e, name)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\framework\ops.py", line 6843, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: assertion failed: [0] [Op:Assert] name: EagerVariableNameReuse
Possible solutions:
1. Kill concurrent Python programs (such as Jupyter notebooks) that are trying to access TensorFlow at the same time.
2. Setting the environment variable TF_FORCE_GPU_ALLOW_GROWTH to true seems to make this issue disappear:
import os
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = "true"
I have also attached the following similar issues for reference, which might help you out: link1, link2, link3

Python 3.8 Downloading Packages/Modules error using PIP

I am trying to install numpy, but it is giving this error. Please help, what should I do?
ERROR: Exception:
Traceback (most recent call last):
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_vendor\urllib3\response.py", line 425, in _error_catcher
yield
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_vendor\urllib3\response.py", line 507, in read
data = self._fp.read(amt) if not fp_closed else b""
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 62, in read
data = self.__fp.read(amt)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\http\client.py", line 454, in read
n = self.readinto(b)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\http\client.py", line 498, in readinto
n = self.fp.readinto(b)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\cli\base_command.py", line 186, in _main
status = self.run(options, args)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\commands\install.py", line 331, in run
resolver.resolve(requirement_set)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\legacy_resolve.py", line 177, in resolve
discovered_reqs.extend(self._resolve_one(requirement_set, req))
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\legacy_resolve.py", line 333, in _resolve_one
abstract_dist = self._get_abstract_dist_for(req_to_install)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\legacy_resolve.py", line 282, in _get_abstract_dist_for
abstract_dist = self.preparer.prepare_linked_requirement(req)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\operations\prepare.py", line 480, in prepare_linked_requirement
local_path = unpack_url(
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\operations\prepare.py", line 282, in unpack_url
return unpack_http_url(
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\operations\prepare.py", line 158, in unpack_http_url
from_path, content_type = _download_http_url(
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\operations\prepare.py", line 303, in _download_http_url
for chunk in download.chunks:
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\utils\ui.py", line 160, in iter
for x in it:
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_internal\network\utils.py", line 15, in response_chunks
for chunk in response.raw.stream(
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_vendor\urllib3\response.py", line 564, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_vendor\urllib3\response.py", line 529, in read
raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "c:\users\cutea\appdata\local\programs\python\python38-32\lib\site-packages\pip\_vendor\urllib3\response.py", line 430, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
pip._vendor.urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Read timed out.
Look directly at the last line:
Read timed out
Connect to Wi-Fi or a faster internet connection and try again.
My internet connection was poor when I got this error; after I retried on a faster connection, it worked for me.

AttributeError: 'S3File' object has no attribute 'getvalue', while running to_csv

I'm running the to_csv command as follows, writing to an output file in an S3 bucket with ServerSideEncryption enabled:
df.to_csv("s3://mys3bucket/result.csv",
          storage_options={'s3_additional_kwargs':
                           {'ServerSideEncryption': 'AES256'}})
I'm getting the following attribute error:
File "/usr/lib/python2.7/site-packages/dask/dataframe/core.py", line 1091, in to_csv
return to_csv(self, filename, **kwargs)
File "/usr/lib/python2.7/site-packages/dask/dataframe/io/csv.py", line 577, in to_csv
delayed(values).compute(get=get, scheduler=scheduler)
File "/usr/lib/python2.7/site-packages/dask/base.py", line 156, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/lib/python2.7/site-packages/dask/base.py", line 400, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 2159, in get
direct=direct)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 1562, in gather
asynchronous=asynchronous)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 652, in sync
return sync(self.loop, func, *args, **kwargs)
File "/usr/lib/python2.7/site-packages/distributed/utils.py", line 275, in sync
six.reraise(*error[0])
File "/usr/lib/python2.7/site-packages/distributed/utils.py", line 260, in f
result[0] = yield make_coro()
File "/usr/lib64/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
File "/usr/lib64/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
File "/usr/lib64/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 1439, in _gather
traceback)
File "/usr/lib/python2.7/site-packages/dask/dataframe/io/csv.py", line 439, in _to_csv_chunk
df.to_csv(f, **kwargs)
File "/usr/lib64/python2.7/site-packages/pandas/core/frame.py", line 1745, in to_csv
formatter.save()
File "/usr/lib64/python2.7/site-packages/pandas/io/formats/csvs.py", line 161, in save
buf = f.getvalue()
File "/usr/lib/python2.7/site-packages/dask/bytes/utils.py", line 136, in __getattr__
return getattr(self.file, key)
AttributeError: 'S3File' object has no attribute 'getvalue'
I searched for this error, but couldn't find a relevant solution.
Do you have any idea?

Scrapyd S3 feed export "Connection Reset by Peer"

I'm running Scrapyd with FEED_URI set to export to S3, but I received the following error at the very end of my scrape. Note that it successfully uploaded a few hundred KB of data to the bucket as the scrape began, then threw this error at the end:
2014-11-24 10:11:23+0000 [word] ERROR: Error storing csv feed (2285242 items) in: s3://kitchen.bucket/FoodItem.csv
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 783, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 763, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/python/threadpool.py", line 191, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/feedexport.py", line 101, in _store_in_thread
key.set_contents_from_file(file)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 1291, in set_contents_from_file
chunked_transfer=chunked_transfer, size=size)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 748, in send_file
chunked_transfer=chunked_transfer, size=size)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 949, in _send_file_internal
query_args=query_args
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 664, in make_request
retry_handler=retry_handler
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 1068, in make_request
retry_handler=retry_handler)
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 939, in _mexe
request.body, request.headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 842, in sender
http_conn.send(chunk)
File "/usr/lib/python2.7/httplib.py", line 805, in send
self.sock.sendall(data)
File "/usr/lib/python2.7/ssl.py", line 329, in sendall
v = self.send(data[count:])
File "/usr/lib/python2.7/ssl.py", line 298, in send
v = self._sslobj.write(data)
socket.error: [Errno 104] Connection reset by peer
This looks similar to boto issue 2207. I'm using gbirke's MultiFeedExporter, and I received a similar error on both of my items.