I tried using an InMemoryEvaluatorHook with a TPUEstimator to get validation statistics while training my model. Using a loop of estimator.train() and estimator.evaluate() was too expensive as it rebuilt the graph every epoch, rather than trying to reuse it (as referenced in this issue: https://github.com/tensorflow/tensorflow/issues/13895). This is the basic code I use:
estimator = tf.contrib.tpu.TPUEstimator(
model_fn=model_fn,
config=run_config,
use_tpu=True,
train_batch_size=self.batch_size,
eval_batch_size=self.batch_size,
predict_batch_size=self.batch_size,
params={})
train_fn = lambda params: input_fn(
'train', self.data_dir, batch_size=params['batch_size'], train=True)
val_fn = lambda params: input_fn(
'validation',
self.data_dir,
batch_size=params['batch_size'],
train=False)
train_hook = tf.contrib.estimator.InMemoryEvaluatorHook(
estimator,
val_fn,
steps=self.steps_per_val_epoch,
every_n_iter=self.steps_per_epoch)
estimator.train(
input_fn=train_fn,
steps=self.steps_per_epoch * self.max_num_training_epochs,
hooks=[
train_hook,
])
This resulted in the following error:
Traceback (most recent call last):
File "dev/google_communicator/worker.py", line 160, in <module>
main()
File "dev/google_communicator/worker.py", line 133, in main
results = evaluator.eval(inputs, outputs)
File "/darch/deep_architect/contrib/misc/evaluators/tensorflow/tpu_estimator_classification.py", line 278, in eval
train_hook,
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2409, in train
rendezvous.raise_errors()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 128, in raise_errors
six.reraise(typ, value, traceback)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2403, in train
saving_listeners=saving_listeners
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1241, in _train_model_default
saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1468, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 631, in __init__
h.begin()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/estimator/python/estimator/hooks.py", line 135, in begin
self._input_fn, self._hooks, checkpoint_path=None)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1484, in _evaluate_build_graph
self._call_model_fn_eval(input_fn, self.config))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1520, in _call_model_fn_eval
features, labels, model_fn_lib.ModeKeys.EVAL, config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2195, in _call_model_fn
features, labels, mode, config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1195, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2631, in _model_fn
rendezvous=self._rendezvous[mode]),
KeyError: 'eval'
Is there a better way to get validation statistics every epochs with TPUs? If not, how are you supposed to do validation?
Edit: I seemed to have gotten past this error by running estimator.train() and estimator.evaluate() for one step without the hook, and then running the full training with the hook. Unfortunately, after the first evaluation, there is an error with restarting training:
Traceback (most recent call last):
File "dev/google_communicator/worker.py", line 160, in <module>
main()
File "dev/google_communicator/worker.py", line 133, in main
results = evaluator.eval(inputs, outputs)
File "/darch/deep_architect/contrib/misc/evaluators/tensorflow/tpu_estimator_classification.py", line 329, in eval
train_hook,
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2409, in train
rendezvous.raise_errors()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/error_handling.py", line 128, in raise_errors
six.reraise(typ, value, traceback)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py", line 2403, in train
saving_listeners=saving_listeners
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1241, in _train_model_default
saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1471, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 671, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1156, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1255, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1240, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1312, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1076, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.FailedPreconditionError: The TPU system has not been initialized.
[[{{node TPUReplicate/_compile/_14248540389241865347/_28}} = TPUCompile[NumDynamicShapes=0, Tguaranteed_constants=[], function=cluster_18378946049549366873_f15n_0[], metadata="\n\006\010...6\323\352L", num_computations=1, _device="/job:worker/replica:0/task:0/device:CPU:0"](^cluster/control_before/_0)]]
[[{{node tpu_compile_succeeded_assert/_1897752282630996029/_29_G679}} = _Recv[client_terminated=false, recv_device="/job:worker/replica:0/task:0/device:TPU:2", send_device="/job:worker/replica:0/task:0/device:CPU:0", send_device_incarnation=2337451129362726278, tensor_name="edge_174_tpu_compile_succeeded_assert/_1897752282630996029/_29", tensor_type=DT_FLOAT, _device="/job:worker/replica:0/task:0/device:TPU:2"]()]]
To clarify, the following things happen before the error is thrown: the two initializing train and evaluate calls to the estimator, training for one epoch, evaluating on the validation set. When the estimator tries to restart training, this exception is thrown.
This open issue might be relevant: https://github.com/tensorflow/tensor2tensor/issues/1202
Related
I am trying to build an object detection model with my custom dataset having only 1 class.
While following all the procedures explained in the tutorial the script crashes and log out the following error
tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
(0) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[{{node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d}}]]
[[Loss/unstack_1/_10307]]
(1) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[{{node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d}}]]
0 successful operations.
0 derived errors ignored.
tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
(0) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[{{node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d}}]]
[[Loss/unstack_1/_10307]]
(1) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[{{node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "model_main.py", line 109, in <module>
tf.app.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "model_main.py", line 105, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 367, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1192, in _train_model_default
saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1484, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1252, in run
run_metadata=run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1353, in run
raise six.reraise(*original_exc_info)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/six.py", line 703, in reraise
raise value
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1338, in run
return self._sess.run(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1411, in run
run_metadata=run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1169, in run
return self._sess.run(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
(0) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d (defined at /home/stud/hammadal/custom-model/models/research/slim/nets/inception_v2.py:129) ]]
[[Loss/unstack_1/_10307]]
(1) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d (defined at /home/stud/hammadal/custom-model/models/research/slim/nets/inception_v2.py:129) ]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d':
File "model_main.py", line 109, in <module>
tf.app.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "model_main.py", line 105, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 367, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1188, in _train_model_default
features, labels, ModeKeys.TRAIN, self.config)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1146, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/stud/hammadal/custom-model/models/research/object_detection/model_lib.py", line 308, in model_fn
features[fields.InputDataFields.true_image_shape])
File "/home/stud/hammadal/custom-model/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 600, in predict
preprocessed_inputs)
File "/home/stud/hammadal/custom-model/models/research/object_detection/models/ssd_inception_v2_feature_extractor.py", line 130, in extract_features
scope=scope)
File "/home/stud/hammadal/custom-model/models/research/slim/nets/inception_v2.py", line 129, in inception_v2_base
scope=end_point)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 182, in func_with_args
return func(*args, **current_args)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 2784, in separable_convolution2d
outputs = layer.apply(inputs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1479, in apply
return self.__call__(inputs, *args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 537, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 634, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 146, in wrapper
), args, kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 446, in converted_call
return _call_unconverted(f, args, kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 253, in _call_unconverted
return f(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/keras/layers/convolutional.py", line 1658, in call
data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py", line 793, in separable_conv2d
name=name)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 1953, in conv2d
name=name)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 1071, in conv2d
data_format=data_format, dilations=dilations, name=name)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
op_def=op_def)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
op_def=op_def)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
self._traceback = tf_stack.extract_stack()
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "model_main.py", line 109, in <module>
tf.app.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "model_main.py", line 105, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 367, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1192, in _train_model_default
saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1484, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1252, in run
run_metadata=run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1353, in run
raise six.reraise(*original_exc_info)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/six.py", line 703, in reraise
raise value
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1338, in run
return self._sess.run(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1411, in run
run_metadata=run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1169, in run
return self._sess.run(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: 2 root error(s) found.
(0) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d (defined at /home/stud/hammadal/custom-model/models/research/slim/nets/inception_v2.py:129) ]]
[[Loss/unstack_1/_10307]]
(1) Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d (defined at /home/stud/hammadal/custom-model/models/research/slim/nets/inception_v2.py:129) ]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d':
File "model_main.py", line 109, in <module>
tf.app.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "model_main.py", line 105, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 367, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1188, in _train_model_default
features, labels, ModeKeys.TRAIN, self.config)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1146, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/home/stud/hammadal/custom-model/models/research/object_detection/model_lib.py", line 308, in model_fn
features[fields.InputDataFields.true_image_shape])
File "/home/stud/hammadal/custom-model/models/research/object_detection/meta_architectures/ssd_meta_arch.py", line 600, in predict
preprocessed_inputs)
File "/home/stud/hammadal/custom-model/models/research/object_detection/models/ssd_inception_v2_feature_extractor.py", line 130, in extract_features
scope=scope)
File "/home/stud/hammadal/custom-model/models/research/slim/nets/inception_v2.py", line 129, in inception_v2_base
scope=end_point)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 182, in func_with_args
return func(*args, **current_args)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/layers.py", line 2784, in separable_convolution2d
outputs = layer.apply(inputs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1479, in apply
return self.__call__(inputs, *args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/layers/base.py", line 537, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 634, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 146, in wrapper
), args, kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 446, in converted_call
return _call_unconverted(f, args, kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 253, in _call_unconverted
return f(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/keras/layers/convolutional.py", line 1658, in call
data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py", line 793, in separable_conv2d
name=name)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 1953, in conv2d
name=name)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 1071, in conv2d
data_format=data_format, dilations=dilations, name=name)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
op_def=op_def)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
op_def=op_def)
File "/nfs/student/hammadal/custom-model/tf1.14/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
self._traceback = tf_stack.extract_stack()
This is being faced while running it on to a server where I can use the power of the GPU.
When I run the script on the local machine using only CPU and batch size of 1 the script executes.
The script being used is from the tensorflow official repo HERE.
The server hardware information is as follow:
> OS: Ubuntu x86_64 memory: 503GiB
> system memory processor: Intel(R)
> Xeon(R) CPU E5-2630 v4 # 2.20GHz
> display: GV100GL [Tesla V100 PCIe 32GB]
Libraries:
> tensorflow-gpu: 1.14
> numpy: 1.16
> absl-py 0.9
I have been trying to work my way through since last 2 weeks. If someone can help or guide me what do I need to read I would highly appericiate it
It looks like cuDNN failed to initialize. Which is related more so to TensorFlow. Try using the following on the server, which should install cuDNN properly:
conda install tensorflow-gpu
I want to add some additional layers behind the Bert model, when loading the pre_training model, delete these layers from the list that needs to be loaded, but it fails, and if still reports that cannot find the parameter. use tensorflow 1.14.0
Can someone tell me how to do it? Thanks very much!
enter image description here
enter image description here
python
final_hidden_shape = modeling.get_shape_list(final_hidden,
expected_rank=3)
batch_size = final_hidden_shape[0]
seq_length = final_hidden_shape[1]
hidden_size = final_hidden_shape[2]
lstm=tf.keras.layers.LSTM(hidden_size, return_sequences=True,
return_state=True,name='cls/squad/mylstm') #this my layer
final_hidden, final_memory_state, final_carry_state = lstm(final_hidden)
output_weights = tf.get_variable(
"cls/squad/output_weights", [2, hidden_size],
initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.get_variable(
"cls/squad/output_bias", [2], initializer=tf.zeros_initializer())
final_hidden_matrix = tf.reshape(final_hidden,
[batch_size * seq_length, hidden_size])
logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
tvars = tf.trainable_variables() #获得所有可训练的权重参数
tvars=tvars[:198] #here I want delet the layer that I add
print(tvars)
initialized_variable_names = {}
scaffold_fn = None
if init_checkpoint:
(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
print(assignment_map)
print(initialized_variable_names)
if use_tpu:
# def tpu_scaffold():
# tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
# sess = tf.Session()
# saver = tf.train.Saver(tvars)
# saver.restore(sess,init_checkpoint)
# return tf.train.Scaffold()
# scaffold_fn = tpu_scaffold
pass
else:
# pass
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
I got the following error
INFO:tensorflow:Restoring parameters from tmp/squad2.0_base/model.ckpt-6000
I0515 06:17:15.064314 139686871394176 saver.py:1280] Restoring parameters
from tmp/squad2.0_base/model.ckpt-6000
2020-05-15 06:17:15.907003: W
tensorflow/compiler/jit/mark_for_compilation_pass.cc:1412] (One-time
warning): Not using XLA:CPU for cluster because envvar TF_XLA_FLAGS=--
tf_xla_cpu_global_jit was not set. If you want XLA:CPU, either set that
envvar, or use experimental_jit_scope to enable XLA:CPU. To confirm that XLA
is active, pass --vmodule=xla_compilation_cache=1 (as a proper command-line
flag, not via TF_XLA_FLAGS) or set the envvar XLA_FLAGS=--xla_hlo_profile.
2020-05-15 06:17:15.956482: W tensorflow/core/framework/op_kernel.cc:1502]
OP_REQUIRES failed at save_restore_v2_ops.cc:184 : Not found: Key
cls/squad/mylstm/bias not found in checkpoint
ERROR:tensorflow:Error recorded from training_loop: Restoring from checkpoint
failed. This is most likely due to a Variable name or other graph key that is
missing from the checkpoint. Please ensure that you have not altered the
graph
expected based on the checkpoint. Original error:
Key cls/squad/mylstm/bias not found in checkpoint
[[node save/RestoreV2 (defined at run_squad.py:1278) ]]
Original stack trace for 'save/RestoreV2':
File "run_squad.py", line 1350, in <module>
tf.app.run()
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 250, in
_run_main
sys.exit(main(argv))
File "run_squad.py", line 1278, in main
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line
2871, in train
saving_listeners=saving_listeners)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow_estimator/python/estimator/estimator.py", line 367, in
train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in
_train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow_estimator/python/estimator/estimator.py", line 1192, in
_train_model_default
saving_listeners)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow_estimator/python/estimator/estimator.py", line 1480, in
_train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 584, in
MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 1007, in
__init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 725, in
__init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 1200, in
__init__
_WrappedSession.__init__(self, self._create_session())
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 1205, in
_create_session
return self._sess_creator.create_session()
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 871, in
create_session
self.tf_sess = self._session_creator.create_session()
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 638, in
create_session
self._scaffold.finalize()
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/monitored_session.py", line 237, in
finalize
self._saver.build()
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/saver.py", line 837, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/saver.py", line 875, in _build
build_restore=build_restore)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/saver.py", line 502, in _build_internal
restore_sequentially, reshape)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/saver.py", line 381, in
_AddShardedRestoreOps
name="restore_shard"))
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/saver.py", line 328, in _AddRestoreOps
restore_sequentially)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/training/saver.py", line 575, in bulk_restore
return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/ops/gen_io_ops.py", line 1696, in restore_v2
name=name)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/framework/op_def_library.py", line 788, in
_apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/framework/ops.py", line 3616, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/framework/ops.py", line 2005, in __init__
self._traceback = tf_stack.extract_stack()
E0515 06:17:15.972786 139686871394176 error_handling.py:70] Error recorded
from training_loop: Restoring from checkpoint failed. This is most likely
due to a Variable name or other graph key that is missing from the
checkpoint. Please ensure that you have not altered the graph expected based
on the checkpoint. Original error:
I'm training ssd_mobilenet_v1_0.75_depth_300x300_coco14 on CoLab.
Here is the command:
!python /root/models/research/object_detection/model_main.py \
--pipeline_config_path=/drive/data/ssd_mobilenet_v1_0.75_depth_300x300_coco14_sync.config \
--model_dir=/drive/data/ \
--num_train_steps=50000 \
--num_eval_steps=2000 \
--alsologtostderr
... and the output:
Traceback (most recent call last):
File "/root/models/research/object_detection/model_main.py", line 101, in <module>
tf.app.run()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/root/models/research/object_detection/model_main.py", line 97, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/training.py", line 451, in train_and_evaluate
return executor.run()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/training.py", line 590, in run
return self.run_local()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/training.py", line 691, in run_local
saving_listeners=saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 376, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 1145, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 1173, in _train_model_default
saving_listeners)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 1451, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py", line 583, in run
run_metadata=run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py", line 1059, in run
run_metadata=run_metadata)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py", line 1150, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python3.6/dist-packages/six.py", line 693, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py", line 1135, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py", line 1215, in run
run_metadata=run_metadata))
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 464, in after_run
if self._save(run_context.session, global_step):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 489, in _save
if l.after_save(session, step):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/training.py", line 497, in after_save
self._evaluate(global_step_value) # updates self.eval_result
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/training.py", line 517, in _evaluate
self._evaluator.evaluate_and_export())
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/training.py", line 884, in evaluate_and_export
hooks=self._eval_spec.hooks)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 463, in evaluate
input_fn, hooks, checkpoint_path)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 1463, in _evaluate_build_graph
features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/estimator/estimator.py", line 1133, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/root/models/research/object_detection/model_lib.py", line 387, in model_fn
include_metrics_per_category=eval_config.include_metrics_per_category)
TypeError: get_eval_metric_ops_for_evaluators() got an unexpected keyword argument 'include_metrics_per_category'
Does anybody know how to get rid the error?
TypeError: get_eval_metric_ops_for_evaluators() got an unexpected keyword argument 'include_metrics_per_category'
I was able to fix it by switching to Python 2 in Runtime settings.
But still don't know how to fix it on Python 3 :|
I got an error when I changed new images to train the im2txt model. Don't know why.
Build the model.
bazel build -c opt im2txt/...
bazel-bin/im2txt/train
--input_file_pattern="${MY_DATA_DIR}/train-?????-of-00256"
--inception_checkpoint_file="${INCEPTION_CHECKPOINT}"
--train_dir="${MODEL_DIR}/train"
--train_inception=false
--number_of_steps=10000
It went to error when running below sentence
sequence_length = tf.reduce_sum(self.input_mask, 1)
lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell,
inputs=self.seq_embeddings,
sequence_length=sequence_length,
initial_state=initial_state,
dtype=tf.float32,
scope=lstm_scope)
The detail info is below
INFO:tensorflow:global_step/sec: 0
INFO:tensorflow:global step 1: loss = 9.5415 (37.21 sec/step)
INFO:tensorflow:global step 2: loss = 6.6332 (12.90 sec/step)
INFO:tensorflow:global step 3: loss = 3.1327 (13.01 sec/step)
INFO:tensorflow:global step 4: loss = 6.2893 (12.04 sec/step)
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.UnimplementedError'>, TensorArray has size zero, but element shape is not fully defined. Currently only static shapes are supported when packing zero-size TensorArrays.
[[Node: OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3 = TensorArrayGatherV3[_class=["loc:#lstm/lstm/TensorArray_1"], dtype=DT_FLOAT, element_shape=, _device="/job:localhost/replica:0/task:0/cpu:0"](OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/TensorArrayGradV3, lstm/lstm/TensorArrayUnstack/range, OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/gradient_flow)]]
Caused by op u'OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3', defined at:
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 155, in
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 135, in main
learning_rate_decay_fn=learning_rate_decay_fn)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/optimizers.py", line 226, in optimize_loss
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 345, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients_impl.py", line 482, in gradients
in_grads = grad_fn(op, *out_grads)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_grad.py", line 186, in _TensorArrayScatterGrad
grad = g.gather(indices)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_ops.py", line 328, in gather
element_shape=element_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 2226, in _tensor_array_gather_v3
element_shape=element_shape, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1226, in init
self._traceback = _extract_stack()
...which was originally created as op u'lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3', defined at:
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 155, in
tf.app.run()
[elided 0 identical lines from previous traceback]
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(sys.argv[:1] + flags_passthrough))
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 89, in main
model.build()
File "/data/projects/content_creator/image2text/im2txt/im2txt/show_and_tell_model.py", line 437, in build
self.build_model()
File "/data/projects/content_creator/image2text/im2txt/im2txt/show_and_tell_model.py", line 356, in build_model
scope=lstm_scope)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/rnn.py", line 546, in dynamic_rnn
dtype=dtype)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/rnn.py", line 664, in dynamic_rnn_loop
for ta, input in zip(input_ta, flat_input))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/rnn.py", line 664, in
for ta, input in zip(input_ta, flat_input))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_ops.py", line 380, in unstack
indices=math_ops.range(0, num_elements), value=value, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_ops.py", line 408, in scatter
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 2492, in _tensor_array_scatter_v3
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
UnimplementedError (see above for traceback): TensorArray has size zero, but element shape is not fully defined. Currently only static shapes are supported when packing zero-size TensorArrays.
[[Node: OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3 = TensorArrayGatherV3[_class=["loc:#lstm/lstm/TensorArray_1"], dtype=DT_FLOAT, element_shape=, _device="/job:localhost/replica:0/task:0/cpu:0"](OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/TensorArrayGradV3, lstm/lstm/TensorArrayUnstack/range, OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/gradient_flow)]]
Traceback (most recent call last):
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 155, in
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 152, in main
saver=saver)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 793, in train
train_step_kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 530, in train_step
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 965, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1015, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1035, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnimplementedError: TensorArray has size zero, but element shape is not fully defined. Currently only static shapes are supported when packing zero-size TensorArrays.
[[Node: OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3 = TensorArrayGatherV3[_class=["loc:#lstm/lstm/TensorArray_1"], dtype=DT_FLOAT, element_shape=, _device="/job:localhost/replica:0/task:0/cpu:0"](OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/TensorArrayGradV3, lstm/lstm/TensorArrayUnstack/range, OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/gradient_flow)]]
Caused by op u'OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3', defined at:
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 155, in
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 135, in main
learning_rate_decay_fn=learning_rate_decay_fn)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/optimizers.py", line 226, in optimize_loss
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 345, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients_impl.py", line 482, in gradients
in_grads = grad_fn(op, *out_grads)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_grad.py", line 186, in _TensorArrayScatterGrad
grad = g.gather(indices)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_ops.py", line 328, in gather
element_shape=element_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 2226, in _tensor_array_gather_v3
element_shape=element_shape, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1226, in init
self._traceback = _extract_stack()
...which was originally created as op u'lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3', defined at:
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 155, in
tf.app.run()
[elided 0 identical lines from previous traceback]
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(sys.argv[:1] + flags_passthrough))
File "/data/projects/content_creator/image2text/im2txt/bazel-bin/im2txt/train.runfiles/im2txt/im2txt/train.py", line 89, in main
model.build()
File "/data/projects/content_creator/image2text/im2txt/im2txt/show_and_tell_model.py", line 437, in build
self.build_model()
File "/data/projects/content_creator/image2text/im2txt/im2txt/show_and_tell_model.py", line 356, in build_model
scope=lstm_scope)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/rnn.py", line 546, in dynamic_rnn
dtype=dtype)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/rnn.py", line 664, in dynamic_rnn_loop
for ta, input in zip(input_ta, flat_input))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/rnn.py", line 664, in
for ta, input in zip(input_ta, flat_input))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_ops.py", line 380, in unstack
indices=math_ops.range(0, num_elements), value=value, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/tensor_array_ops.py", line 408, in scatter
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 2492, in _tensor_array_scatter_v3
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
UnimplementedError (see above for traceback): TensorArray has size zero, but element shape is not fully defined. Currently only static shapes are supported when packing zero-size TensorArrays.
[[Node: OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGatherV3 = TensorArrayGatherV3[_class=["loc:#lstm/lstm/TensorArray_1"], dtype=DT_FLOAT, element_shape=, _device="/job:localhost/replica:0/task:0/cpu:0"](OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/TensorArrayGradV3, lstm/lstm/TensorArrayUnstack/range, OptimizeLoss/gradients/lstm/lstm/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3_grad/TensorArrayGrad/gradient_flow)]]
I am trying to reverse my inputs with array_ops.reverse_sequence() before sending it to dynamic_rnn(), the inference graph can be build with no problem, but when building the training graph, I got the following error:
Traceback (most recent call last):
File "bin/trainer.py", line 158, in <module>
kmer_len=args.kmer_len)
File "/home/ubuntu/GIT/IvyMike/ivymike/base_model.py", line 193, in run_training
train_op = model.training(loss, learning_rate)
File "/home/ubuntu/GIT/IvyMike/ivymike/base_model.py", line 100, in training
train_op = optimizer.minimize(loss, global_step=global_step)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 190, in minimize
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 241, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gradients.py", line 481, in gradients
in_grads = _AsList(grad_fn(op, *out_grads))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/array_grad.py", line 307, in _ReverseSequenceGrad
seq_lengths=seq_lengths),
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1143, in reverse_sequence
batch_dim=batch_dim, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2119, in create_op
set_shapes_for_outputs(ret)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1586, in set_shapes_for_outputs
shapes = shape_func(op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/array_ops.py", line 1257, in _ReverseSequenceShape
(batch_dim, input_shape.ndims))
TypeError: %d format: a number is required, not NoneType
Any idea what went wrong?
This has been fixed at the mater in TensorFlow