Job failed on Cloud ML after successful completion of 1000 steps - tensorflow

I walked through this Cloud ML tutorial on census data: cloud.google.com/ml-engine/docs/how-tos/getting-started-training-prediction, and that job was successful. However, when I walk through this tutorial on flower image data: https://cloud.google.com/blog/big-data/2016/12/how-to-classify-images-with-tensorflow-using-google-cloud-machine-learning-and-cloud-dataflow, my training task appears to be successful based on the completion of 1000 steps in the log, yet upon completion the Stackdriver logs snapshot says the job failed. I have tried reusing the same structure while replacing the command-line arguments from the census data walkthrough, deleting and recreating the JOB_ID and the --output_path user argument, and using the STANDARD_1 scale tier, but to no avail. Any help from the community would be appreciated. Thanks!
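For reference, I submitted the job following the structure of the tutorial's command, roughly like this (bucket, job id, and paths are placeholders, and the flags are written from memory of the tutorial, so treat it as a sketch):
gcloud ml-engine jobs submit training "$JOB_ID" \
  --module-name trainer.task \
  --package-path trainer \
  --staging-bucket "$BUCKET" \
  --region us-central1 \
  -- \
  --output_path "${GCS_PATH}/training" \
  --eval_data_paths "${GCS_PATH}/preproc/eval*" \
  --train_data_paths "${GCS_PATH}/preproc/train*"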
Below are the errors that popped up towards the tail end of the logs snapshot:
{
textPayload: "The replica master 0 exited with a non-zero status of 1. Termination reason: Error.
Traceback (most recent call last):
File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 542, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 305, in main
run(model, argv)
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 436, in run
dispatch(args, model, cluster, task)
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 477, in dispatch
Trainer(args, model, cluster, task).run_training()
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 241, in run_training
self.eval(session)
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 283, in eval
self.model.format_metric_values(self.evaluator.evaluate()))
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 95, in evaluate
return metric_values
File "/usr/lib/python2.7/contextlib.py", line 35, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 960, in managed_session
self.stop(close_summary_writer=close_summary_writer)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 788, in stop
stop_grace_period_secs=self._stop_grace_secs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 386, in join
six.reraise(*self._exc_info_to_raise)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/queue_runner_impl.py", line 234, in _run
sess.run(enqueue_op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 965, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1015, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1035, in _do_call
raise type(e)(node_def, op, message)
NotFoundError: Error executing an HTTP request (HTTP response code 404, error code 0, error message '')
when reading gs://project-166422-ml/User/flowers_User_20170522_121407/preproc/eval
[[Node: ReaderReadUpToV2 = ReaderReadUpToV2[_device="/job:localhost/replica:0/task:0/cpu:0"](TFRecordReaderV2, input_producer, ReaderReadUpToV2/num_records)]]
Caused by op u'ReaderReadUpToV2', defined at:
File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 542, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 305, in main
run(model, argv)
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 436, in run
dispatch(args, model, cluster, task)
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 477, in dispatch
Trainer(args, model, cluster, task).run_training()
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 241, in run_training
self.eval(session)
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 283, in eval
self.model.format_metric_values(self.evaluator.evaluate()))
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 57, in evaluate
self.eval_batch_size)
File "/root/.local/lib/python2.7/site-packages/trainer/model.py", line 310, in build_eval_graph
return self.build_graph(data_paths, batch_size, GraphMod.EVALUATE)
File "/root/.local/lib/python2.7/site-packages/trainer/model.py", line 231, in build_graph
num_epochs=None if is_training else 2)
File "/root/.local/lib/python2.7/site-packages/trainer/util.py", line 52, in read_examples
filename_queue, batch_size)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/io_ops.py", line 226, in read_up_to
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 380, in _reader_read_up_to_v2
num_records=num_records, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
self._traceback = _extract_stack()
NotFoundError (see above for traceback): Error executing an HTTP request (HTTP response code 404, error code 0, error message '')
when reading gs://project-166422-ml/User/flowers_User_20170522_121407/preproc/eval
[[Node: ReaderReadUpToV2 = ReaderReadUpToV2[_device="/job:localhost/replica:0/task:0/cpu:0"](TFRecordReaderV2, input_producer, ReaderReadUpToV2/num_records)]]
To find out more about why your job exited please check the logs: console.cloud.google.com/logs/viewer?project=123456234&resource=ml_job%2Fjob_id%2Fflowers_User_20170524_145125&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22flowers_User_20170524_145125%22"

The error indicates a 404 Not Found when trying to read
gs://project-166422-ml/User/flowers_User_20170522_121407/preproc/eval
Does that file exist?
Based on the name, I'm guessing that's the evaluation data. So my guess is you are only running evaluation every 1000 steps, which is why 1000 steps complete successfully; the job then tries to run evaluation and fails because the data doesn't exist. Note also that the timestamp in the path being read (20170522_121407) doesn't match the one in the failing job's id (flowers_User_20170524_145125), so the job may have been submitted with a stale path left over from an earlier preprocessing run.
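You can check with gsutil; if the command below lists nothing (or reports that the URL matched no objects), the eval files were never written there:
gsutil ls gs://project-166422-ml/User/flowers_User_20170522_121407/preproc/eval*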

Related

Training the XSeg model for Deepfacelabs fails due to memory error

I'm new to deepfakes and I'm trying to run 5.XSeg) train.bat, and every time it finishes the filtering I get the following error. I use wf, and tried batch sizes from 1 to 8, always with the same result. I have a Ryzen 5 3600, a 3080 Ti and 16 GB of RAM.
Using 26519 xseg labeled samples.
Traceback (most recent call last):
File "multiprocessing\queues.py", line 234, in _feed
File "multiprocessing\reduction.py", line 51, in dumps
MemoryError
Error:
Traceback (most recent call last):
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 1375, in _do_call
return fn(*args)
Traceback (most recent call last):
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 1360, in _run_fn
target_list, run_metadata)
File "multiprocessing\queues.py", line 234, in _feed
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 1453, in _call_tf_sessionrun
run_metadata)
File "multiprocessing\reduction.py", line 51, in dumps
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Attempting to perform BLAS operation using StreamExecutor without BLAS support
[[{{node MatMul}}]]
[[concat_6/concat/_3]]
(1) Internal: Attempting to perform BLAS operation using StreamExecutor without BLAS support
[[{{node MatMul}}]]
0 successful operations.
0 derived errors ignored.
MemoryError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\ModelBase.py", line 263, in update_sample_for_preview
self.get_history_previews()
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\ModelBase.py", line 383, in get_history_previews
return self.onGetPreview (self.sample_for_preview, for_history=True)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\Model_XSeg\Model.py", line 209, in onGetPreview
I, M, IM, = [ np.clip( nn.to_data_format(x,"NHWC", self.model_data_format), 0.0, 1.0) for x in ([image_np,mask_np] + self.view (image_np) ) ]
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\Model_XSeg\Model.py", line 141, in view
return nn.tf_sess.run ( [pred], feed_dict={self.model.input_t :input_np})
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 968, in run
run_metadata_ptr)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 1191, in _run
feed_dict_tensor, options, run_metadata)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 1369, in _do_run
run_metadata)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\client\session.py", line 1394, in _do_call
raise type(e)(node_def, op, message) # pylint: disable=no-value-for-parameter
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Attempting to perform BLAS operation using StreamExecutor without BLAS support
[[node MatMul (defined at E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\layers\Dense.py:66) ]]
[[concat_6/concat/_3]]
(1) Internal: Attempting to perform BLAS operation using StreamExecutor without BLAS support
[[node MatMul (defined at E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\layers\Dense.py:66) ]]
0 successful operations.
0 derived errors ignored.
Errors may have originated from an input operation.
Input Source operations connected to node MatMul:
XSeg/dense1/weight/read (defined at E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\layers\Dense.py:47)
Reshape_60 (defined at E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\ops\__init__.py:182)
Input Source operations connected to node MatMul:
XSeg/dense1/weight/read (defined at E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\layers\Dense.py:47)
Reshape_60 (defined at E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\ops\__init__.py:182)
Original stack trace for 'MatMul':
File "threading.py", line 884, in _bootstrap
File "threading.py", line 916, in _bootstrap_inner
File "threading.py", line 864, in run
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\mainscripts\Trainer.py", line 58, in trainerThread
debug=debug)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\Model_XSeg\Model.py", line 17, in __init__
super().__init__(*args, force_model_class_name='XSeg', **kwargs)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\ModelBase.py", line 193, in __init__
self.on_initialize()
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\Model_XSeg\Model.py", line 103, in on_initialize
gpu_pred_logits_t, gpu_pred_t = self.model.flow(gpu_input_t, pretrain=self.pretrain)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\facelib\XSegNet.py", line 85, in flow
return self.model(x, pretrain=pretrain)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\models\ModelBase.py", line 117, in __call__
return self.forward(*args, **kwargs)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\models\XSeg.py", line 124, in forward
x = self.dense1(x)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\layers\LayerBase.py", line 14, in __call__
return self.forward(*args, **kwargs)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\leras\layers\Dense.py", line 66, in forward
x = tf.matmul(x, weight)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\util\dispatch.py", line 206, in wrapper
return target(*args, **kwargs)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\ops\math_ops.py", line 3655, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 5713, in mat_mul
name=name)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 750, in _apply_op_helper
attrs=attr_protos, op_def=op_def)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\framework\ops.py", line 3569, in _create_op_internal
op_def=op_def)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\python-3.6.8\lib\site-packages\tensorflow\python\framework\ops.py", line 2045, in __init__
self._traceback = tf_stack.extract_stack_for_node(self._c_op)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\mainscripts\Trainer.py", line 58, in trainerThread
debug=debug)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\Model_XSeg\Model.py", line 17, in __init__
super().__init__(*args, force_model_class_name='XSeg', **kwargs)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\ModelBase.py", line 216, in __init__
self.update_sample_for_preview(choose_preview_history=self.choose_preview_history)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\ModelBase.py", line 265, in update_sample_for_preview
self.sample_for_preview = self.generate_next_samples()
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\models\ModelBase.py", line 461, in generate_next_samples
sample.append ( generator.generate_next() )
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\samplelib\SampleGeneratorBase.py", line 21, in generate_next
self.last_generation = next(self)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\samplelib\SampleGeneratorFace.py", line 112, in __next__
return next(generator)
File "E:\DeepFaceLab_NVIDIA_RTX3000_series\_internal\DeepFaceLab\core\joblib\SubprocessGenerator.py", line 73, in __next__
gen_data = self.cs_queue.get()
File "multiprocessing\queues.py", line 94, in get
File "multiprocessing\connection.py", line 216, in recv_bytes
File "multiprocessing\connection.py", line 318, in _recv_bytes
File "multiprocessing\connection.py", line 344, in _get_more_data
MemoryError
Neither reducing the batch size nor increasing the page file helped. I tried to Google it, but I couldn't find a solution.

RASA init error: tensorflow.python.framework.errors_impl.InvalidArgumentError: assertion failed: [0] [Op:Assert] name: EagerVariableNameReuse

I am new to Rasa. I installed Rasa 2.4.1 on my Windows 10, Python 3.7.6 machine without any error, but when I initialise a Rasa project I get the following error. I tried multiple Rasa 2.x versions and multiple TensorFlow installations, but no luck. Any help to resolve this issue is appreciated.
File "D:\NLP\rasa_env\Scripts\rasa.exe\__main__.py", line 7, in <module>
File "d:\nlp\rasa_env\lib\site-packages\rasa\__main__.py", line 116, in main
cmdline_arguments.func(cmdline_arguments)
File "d:\nlp\rasa_env\lib\site-packages\rasa\cli\scaffold.py", line 234, in run
init_project(args, path)
File "d:\nlp\rasa_env\lib\site-packages\rasa\cli\scaffold.py", line 129, in init_project
print_train_or_instructions(args, path)
File "d:\nlp\rasa_env\lib\site-packages\rasa\cli\scaffold.py", line 68, in print_train_or_instructions
training_result = rasa.train(domain, config, training_files, output)
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 109, in train
loop,
File "d:\nlp\rasa_env\lib\site-packages\rasa\utils\common.py", line 308, in run_in_loop
result = loop.run_until_complete(f)
File "c:\users\kni9kor\anaconda3\lib\asyncio\base_events.py", line 583, in run_until_complete
return future.result()
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 174, in train_async
finetuning_epoch_fraction=finetuning_epoch_fraction,
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 353, in _train_async_internal
finetuning_epoch_fraction=finetuning_epoch_fraction,
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 396, in _do_training
finetuning_epoch_fraction=finetuning_epoch_fraction,
File "d:\nlp\rasa_env\lib\site-packages\rasa\train.py", line 818, in _train_nlu_with_validated_data
**additional_arguments,
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\train.py", line 116, in train
interpreter = trainer.train(training_data, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\model.py", line 209, in train
updates = component.train(working_data, self.config, **context)
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\classifiers\diet_classifier.py", line 810, in train
self.model = self._instantiate_model_class(model_data)
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\classifiers\diet_classifier.py", line 1132, in _instantiate_model_class
config=self.component_config,
File "d:\nlp\rasa_env\lib\site-packages\rasa\nlu\classifiers\diet_classifier.py", line 1146, in __init__
super().__init__("DIET", config, data_signature, label_data)
File "d:\nlp\rasa_env\lib\site-packages\rasa\utils\tensorflow\models.py", line 705, in __init__
checkpoint_model=config[CHECKPOINT_MODEL],
File "d:\nlp\rasa_env\lib\site-packages\rasa\utils\tensorflow\models.py", line 91, in __init__
super().__init__(**kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\training\tracking\base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\keras\engine\training.py", line 308, in __init__
self._init_batch_counters()
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\training\tracking\base.py", line 457, in _method_wrapper
result = method(self, *args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\keras\engine\training.py", line 317, in _init_batch_counters
self._train_counter = variables.Variable(0, dtype='int64', aggregation=agg)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 262, in __call__
return cls._variable_v2_call(*args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 256, in _variable_v2_call
shape=shape)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 237, in <lambda>
previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2646, in default_variable_creator_v2
shape=shape)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\variables.py", line 264, in __call__
return super(VariableMetaclass, cls).__call__(*args, **kwargs)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 1518, in __init__
distribute_strategy=distribute_strategy)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 1666, in _init_from_args
graph_mode=self._in_graph_mode)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 243, in eager_safe_variable_handle
shape, dtype, shared_name, name, graph_mode, initial_value)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 175, in _variable_handle_from_shape_and_dtype
math_ops.logical_not(exists), [exists], name="EagerVariableNameReuse")
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 49, in _assert
_ops.raise_from_not_ok_status(e, name)
File "d:\nlp\rasa_env\lib\site-packages\tensorflow\python\framework\ops.py", line 6843, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: assertion failed: [0] [Op:Assert] name: EagerVariableNameReuse
Possible solutions:
1. Kill concurrent Python programs (like Jupyter notebooks) that are trying to access TensorFlow simultaneously.
2. Setting the environment variable TF_FORCE_GPU_ALLOW_GROWTH to true seems to make this issue disappear:
import os
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = "true"
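For this to take effect, set the variable before TensorFlow is imported; alternatively, set it in the shell (e.g. set TF_FORCE_GPU_ALLOW_GROWTH=true on Windows) before running rasa init.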
I have also attached the following similar issues for reference, which might help you out: link1, link2, link3

ValueError: Operation u'tpu_140462710602256/VarIsInitializedOp' has been marked as not fetchable

The code works fine on GPU and CPU, but when I use the keras_to_tpu_model function to make the model able to run on a TPU, this error occurred.
This is the full output on Colab: https://colab.research.google.com/gist/WangHexie/2252beb26f16354cb6e9ba2639970e5b/tpu-error.ipynb
Change the runtime type to TPU and I think this can be reproduced.
Code on GitHub: https://github.com/WangHexie/DHNE/blob/master/src/hypergraph_embedding.py#L60
You can test the code on GPU by changing to the gpu branch.
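For example (a quick sketch, assuming a fresh clone):
git clone https://github.com/WangHexie/DHNE.git
cd DHNE
git checkout gpu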
Traceback
Traceback (most recent call last):
File "src/hypergraph_embedding.py", line 158, in <module>
h.train(dataset)
File "src/hypergraph_embedding.py", line 75, in train
epochs=self.options.epochs_to_train, verbose=1)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/engine/training.py", line 2177, in fit_generator
initial_epoch=initial_epoch)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/engine/training_generator.py", line 176, in fit_generator
x, y, sample_weight=sample_weight, class_weight=class_weight)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/engine/training.py", line 1940, in train_on_batch
outputs = self.train_function(ins)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py", line 1238, in __call__
infeed_manager)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py", line 1143, in _tpu_model_ops_for_input_specs
infeed_manager)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py", line 1053, in _specialize_model
_model_fn, inputs=[[]] * self._tpu_assignment.num_towers)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu.py", line 687, in split_compile_and_replicate
outputs = computation(*computation_inputs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py", line 959, in _model_fn
self.model.cpu_optimizer)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py", line 378, in _clone_optimizer
config = optimizer.get_config()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/optimizers.py", line 275, in get_config
'lr': float(K.get_value(self.lr)),
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/backend.py", line 2709, in get_value
return x.eval(session=get_session())
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/backend.py", line 469, in get_session
_initialize_variables(session)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/keras/backend.py", line 731, in _initialize_variables
[variables_module.is_variable_initialized(v) for v in candidate_vars])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1137, in _run
self._graph, fetches, feed_dict_tensor, feed_handles=feed_handles)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 484, in __init__
self._assert_fetchable(graph, fetch.op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 497, in _assert_fetchable
'Operation %r has been marked as not fetchable.' % op.name)
ValueError: Operation u'tpu_140276544043536/VarIsInitializedOp' has been marked as not fetchable.
I had the same issue, and it confused me for two days. The solution I found is to switch to tf.train.RMSPropOptimizer instead of using RMSProp from tensorflow.keras.optimizers.
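A minimal sketch of the swap (TF 1.x era; the model and learning rate below are placeholders, not the code from the question):
import tensorflow as tf

# Any small Keras model stands in for the real one.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])

# A TF-native optimizer sidesteps the Keras optimizer cloning
# (optimizer.get_config in the traceback above) that fails under
# keras_to_tpu_model.
model.compile(optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
              loss='mse')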

the error message while running model_test.py for tensorflow deeplab

I have been trying to test the installation of deeplab by following this:
# From tensorflow/models/research/
python deeplab/model_test.py
However, I got the following error message; specifically:
2018-04-25 10:54:23.488868: W tensorflow/core/framework/op_kernel.cc:1273] OP_REQUIRES failed at mkl_concat_op.cc:784 : Aborted: Operation received an exception:Status: 3, message: could not create a concat primitive descriptor, in file tensorflow/core/kernels/mkl_concat_op.cc:781
E...
======================================================================
ERROR: testForwardpassDeepLabv3plus (__main__.DeeplabModelTest)
----------------------------------------------------------------------
The complete traceback is as follows:
2018-04-25 10:54:23.488868: W tensorflow/core/framework/op_kernel.cc:1273] OP_REQUIRES failed at mkl_concat_op.cc:784 : Aborted: Operation received an exception:Status: 3, message: could not create a concat primitive descriptor, in file tensorflow/core/kernels/mkl_concat_op.cc:781
E...
======================================================================
ERROR: testForwardpassDeepLabv3plus (__main__.DeeplabModelTest)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
return fn(*args)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1312, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1420, in _call_tf_sessionrun
status, run_metadata)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not create a concat primitive descriptor, in file tensorflow/core/kernels/mkl_concat_op.cc:781
[[Node: concat = _MklConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](ResizeBilinear, aspp0/Relu, concat/axis, DMT/_283, aspp0/Relu:1, DMT/_284)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "deeplab/model_test.py", line 108, in testForwardpassDeepLabv3plus
outputs_to_scales_to_logits = sess.run(outputs_to_scales_to_logits)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 905, in run
run_metadata_ptr)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1140, in _run
feed_dict_tensor, options, run_metadata)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
run_metadata)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not create a concat primitive descriptor, in file tensorflow/core/kernels/mkl_concat_op.cc:781
[[Node: concat = _MklConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](ResizeBilinear, aspp0/Relu, concat/axis, DMT/_283, aspp0/Relu:1, DMT/_284)]]
Caused by op 'concat', defined at:
File "deeplab/model_test.py", line 120, in <module>
tf.test.main()
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/platform/test.py", line 76, in main
return _googletest.main(argv)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/platform/googletest.py", line 99, in main
benchmark.benchmarks_main(true_main=main_wrapper)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/platform/benchmark.py", line 338, in benchmarks_main
true_main()
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/platform/googletest.py", line 98, in main_wrapper
return app.run(main=g_main, argv=args)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/platform/googletest.py", line 69, in g_main
return unittest_main(argv=argv)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/main.py", line 95, in __init__
self.runTests()
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/main.py", line 256, in runTests
self.result = testRunner.run(self.test)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/runner.py", line 176, in run
test(result)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/suite.py", line 84, in __call__
return self.run(*args, **kwds)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/suite.py", line 122, in run
test(result)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/suite.py", line 84, in __call__
return self.run(*args, **kwds)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/suite.py", line 122, in run
test(result)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/case.py", line 653, in __call__
return self.run(*args, **kwds)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/unittest/case.py", line 605, in run
testMethod()
File "deeplab/model_test.py", line 105, in testForwardpassDeepLabv3plus
image_pyramid=[1.0])
File "/data/dsp_emerging/ugwz/virtualE/deeplab/models/research/deeplab/model.py", line 296, in multi_scale_logits
fine_tune_batch_norm=fine_tune_batch_norm)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/models/research/deeplab/model.py", line 461, in _get_logits
fine_tune_batch_norm=fine_tune_batch_norm)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/models/research/deeplab/model.py", line 424, in _extract_features
concat_logits = tf.concat(branch_logits, 3)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 1181, in concat
return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 949, in concat_v2
"ConcatV2", values=values, axis=axis, name=name)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3290, in create_op
op_def=op_def)
File "/data/dsp_emerging/ugwz/virtualE/deeplab/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1654, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
AbortedError (see above for traceback): Operation received an exception:Status: 3, message: could not create a concat primitive descriptor, in file tensorflow/core/kernels/mkl_concat_op.cc:781
[[Node: concat = _MklConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _kernel="MklOp", _device="/job:localhost/replica:0/task:0/device:CPU:0"](ResizeBilinear, aspp0/Relu, concat/axis, DMT/_283, aspp0/Relu:1, DMT/_284)]]
----------------------------------------------------------------------
Ran 5 tests in 23.571s
FAILED (errors=1)
Roll back to TensorFlow 1.6. This issue is still being addressed in versions 1.7 and above:
https://github.com/tensorflow/tensorflow/issues/17494
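For example, within the same virtualenv (use tensorflow-gpu instead if that's the build you installed):
pip install --upgrade tensorflow==1.6.0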
In Google Colab, with runtime type Python 2 or Python 3 and GPU enabled, I run without any error using these commands:
!git clone https://github.com/tensorflow/models.git
%env PYTHONPATH=/env/python/:/content/models/research/:/content/models/research/slim
!python /content/models/research/deeplab/model_test.py

Scrapyd S3 feed export "Connection Reset by Peer"

I'm running Scrapyd with a FEED_URI set to export to S3, but I received the following error at the very end of my scrape. Note that it successfully uploaded a few hundred KB of data to the bucket as the scrape began, then threw this error at the end:
2014-11-24 10:11:23+0000 [word] ERROR: Error storing csv feed (2285242 items) in: s3://kitchen.bucket/FoodItem.csv
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 783, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 763, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/python/threadpool.py", line 191, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/feedexport.py", line 101, in _store_in_thread
key.set_contents_from_file(file)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 1291, in set_contents_from_file
chunked_transfer=chunked_transfer, size=size)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 748, in send_file
chunked_transfer=chunked_transfer, size=size)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 949, in _send_file_internal
query_args=query_args
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 664, in make_request
retry_handler=retry_handler
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 1068, in make_request
retry_handler=retry_handler)
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 939, in _mexe
request.body, request.headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 842, in sender
http_conn.send(chunk)
File "/usr/lib/python2.7/httplib.py", line 805, in send
self.sock.sendall(data)
File "/usr/lib/python2.7/ssl.py", line 329, in sendall
v = self.send(data[count:])
File "/usr/lib/python2.7/ssl.py", line 298, in send
v = self._sslobj.write(data)
socket.error: [Errno 104] Connection reset by peer
Looks similar to boto issue 2207. I'm using gbirke's MultiFeedExporter, and received a similar error on both of my items.
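One thing I plan to try (just a guess on my part, not a confirmed fix): raising boto's retry count and socket timeout in ~/.boto or /etc/boto.cfg, so transient resets near the end of a large upload get retried:
[Boto]
num_retries = 10
http_socket_timeout = 300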