Estimator.predict() has Shape Issues? - tensorflow

I can train and evalaute a Tensorflow Estimator model without any problems. When I do prediction, this error arises:
InvalidArgumentError (see above for traceback): output_shape has incorrect number of elements: 68 should be: 2
[[Node: output = SparseToDense[T=DT_INT32, Tindices=DT_INT32, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ToInt32, ToInt32_1, ToInt32_2, bidirectional_rnn/bidirectional_rnn/fw/fw/time)]]
All of the model functions use the same architecture:
def _train_model_fn(features, labels, mode, params):
features = _network_fn(features, mode, params)
outputs = _get_output(features, params["output_layer"],
params["num_classes"])
predictions = {
"outputs": outputs
}
... # loss initialization and whatnot
def _eval_model_fn(features, labels, mode, params):
features = _network_fn(features, mode, params)
outputs = _get_output(features, params["output_layer"], params["num_classes"])
predictions = {
"outputs": outputs
}
... # loss initialization and whatnot
def _predict_model_fn(features, mode, params):
features = _network_fn(features, mode, params)
outputs = _get_output(features, params["output_layer"], params["num_classes"])
predictions = {
"outputs": outputs
}
...
Here's the predict code:
def predict(params, features, checkpoint_dir):
estimator = tf.estimator.Estimator(model_fn=_predict_model_fn,
params=params,
model_dir=checkpoint_dir)
predictions = estimator.predict(input_fn=_input_fn(features))
for i, p in enumerate(predictions):
print(i, p)
I also checked the shapes given every time the input passes a layer when training, and the same thing for predicting. They give the same shapes:
Training:
conv2d [1, 358, 358, 16]
max_pool2d [1, 179, 179, 16]
collapse_to_rnn_dims [1, 179, 2864]
birnn [1, 179, 64]
Prediction:
conv2d [1, 358, 358, 16]
max_pool2d [1, 179, 179, 16]
collapse_to_rnn_dims [1, 179, 2864]
birnn [1, 179, 64]
Here are the SparseTensors I passed to sparse_to_dense:
Training:
SparseTensor(indices=Tensor("CTCBeamSearchDecoder:0", shape=(?, 2), dtype=int64), values=Tensor("CTCBeamSearchDecoder:1", shape=(?,), dtype=int64), dense_shape=Tensor("CTCBeamSearchDecoder:2", shape=(2,), dtype=int64))
Evaluation:
SparseTensor(indices=Tensor("CTCBeamSearchDecoder:0", shape=(?, 2), dtype=int64), values=Tensor("CTCBeamSearchDecoder:1", shape=(?,), dtype=int64), dense_shape=Tensor("CTCBeamSearchDecoder:2", shape=(2,), dtype=int64))
Prediction:
SparseTensor(indices=Tensor("CTCBeamSearchDecoder:0", shape=(?, 2), dtype=int64), values=Tensor("CTCBeamSearchDecoder:1", shape=(?,), dtype=int64), dense_shape=Tensor("CTCBeamSearchDecoder:2", shape=(2,), dtype=int64))
Which are all pretty much the same.
Any reason why this is happening? Shouldn't the _predict_model_fn work given that it follows the same architecture as that of the other model_fns?
Here's the full stacktrace:
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_service': None, '_save_summary_steps': 100, '_model_dir': 'checkpoint\\model-20180419-150303', '_task_id': 0, '_evaluation_master': '', '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000091F58B3080>, '_num_ps_replicas': 0, '_master': '', '_save_checkpoints_secs': 600, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_global_id_in_cluster': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from checkpoint\model-20180419-150303\model.ckpt-1
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Process Process-2:
Traceback (most recent call last):
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1361, in _do_call
return fn(*args)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _run_fn
target_list, status, run_metadata)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: output_shape has incorrect number of elements: 68 should be: 2
[[Node: output = SparseToDense[T=DT_INT32, Tindices=DT_INT32, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ToInt32, ToInt32_1, ToInt32_2, bidirectional_rnn/bidirectional_rnn/fw/fw/time)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\asus.11\Anaconda3\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
File "C:\Users\asus.11\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\train_ocr.py", line 42, in evaluate_model
evaluate(architecture_params, images, labels, checkpoint_dir)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 82, in evaluate
predict(params, features, checkpoint_dir)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 90, in predict
for i, p in enumerate(predictions):
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 492, in predict
preds_evaluated = mon_sess.run(predictions)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 546, in run
run_metadata=run_metadata)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1022, in run
run_metadata=run_metadata)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1113, in run
raise six.reraise(*original_exc_info)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\six.py", line 693, in reraise
raise value
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1098, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1170, in run
run_metadata=run_metadata)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 950, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 905, in run
run_metadata_ptr)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1137, in _run
feed_dict_tensor, options, run_metadata)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1355, in _do_run
options, run_metadata)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1374, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: output_shape has incorrect number of elements: 68 should be: 2
[[Node: output = SparseToDense[T=DT_INT32, Tindices=DT_INT32, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ToInt32, ToInt32_1, ToInt32_2, bidirectional_rnn/bidirectional_rnn/fw/fw/time)]]
Caused by op 'output', defined at:
File "<string>", line 1, in <module>
File "C:\Users\asus.11\Anaconda3\lib\multiprocessing\spawn.py", line 106, in spawn_main
exitcode = _main(fd)
File "C:\Users\asus.11\Anaconda3\lib\multiprocessing\spawn.py", line 119, in _main
return self._bootstrap()
File "C:\Users\asus.11\Anaconda3\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
File "C:\Users\asus.11\Anaconda3\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\train_ocr.py", line 42, in evaluate_model
evaluate(architecture_params, images, labels, checkpoint_dir)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 82, in evaluate
predict(params, features, checkpoint_dir)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 90, in predict
for i, p in enumerate(predictions):
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 479, in predict
features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 793, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 217, in _predict_model_fn
outputs = _get_output(features, params["output_layer"], params["num_classes"])
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 134, in _get_output
return _sparse_to_dense(decoded, name="output")
File "C:\Users\asus.11\Documents\Optimized_OCR\trainer\backend\tf\experiment_ops.py", line 38, in _sparse_to_dense
name=name)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\ops\sparse_ops.py", line 791, in sparse_to_dense
name=name)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_sparse_ops.py", line 2401, in _sparse_to_dense
name=name)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3271, in create_op
op_def=op_def)
File "C:\Users\asus.11\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1650, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): output_shape has incorrect number of elements: 68 should be: 2
[[Node: output = SparseToDense[T=DT_INT32, Tindices=DT_INT32, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ToInt32, ToInt32_1, ToInt32_2, bidirectional_rnn/bidirectional_rnn/fw/fw/time)]]
Update
I tried using the same architecture in a different training run, I encountered a different shap error:
InvalidArgumentError (see above for traceback): output_shape has incorrect number of elements: 69 should be: 2
[[Node: output = SparseToDense[T=DT_INT32, Tindices=DT_INT32, validate_indices=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](ToInt32, ToInt32_1, ToInt32_2, bidirectional_rnn/bidirectional_rnn/fw/fw/time)]]
Apparently, the problem seems to lie in the ctc_beam_search_decoder. Switching to ctc_greedy_decoder doesn't help either. Why is it doing this?
More updates
I have uploaded the reproducible example: https://github.com/selcouthlyBlue/ShapeErrorReproduce

I have finally figured out the error. The problem actually lies in the way I used sparse_to_dense. Apparently, the order I gave is wrong where the values came first before the shape:
return tf.sparse_to_dense(tf.to_int32(decoded[0].indices),
tf.to_int32(decoded[0].values),
tf.to_int32(decoded[0].dense_shape),
name="output")
The order should be (shape comes first before values):
return tf.sparse_to_dense(tf.to_int32(decoded[0].indices),
tf.to_int32(decoded[0].dense_shape),
tf.to_int32(decoded[0].values),
name="output")

Related

tensorflow tf.cond does not execute true_fn or false_fn for tf.reduce_mean

I am trying to condition the output of the loss function tf.reduce_mean so as to avoid NaN errors. My code is:
limit=[]
for i in xrange(12):
limit.append(10000.0)
limit = tf.constant(limit)
predictions["loss"] =tf.cond(tf.reduce_mean(
(prediction - transformed_values) ** 2, axis=-1) < limit,
lambda:tf.reduce_mean(
(prediction - transformed_values) ** 2, axis=-1),
lambda:tf.reduce_mean(
(prediction - transformed_values), axis=-1)).
However, I get the error
INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpfnvr6j
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7eaa5bd750>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpfnvr6j', '_save_summary_steps': 100}
shape: pred (12,) true_t (12,) false_t (12,)
Traceback (most recent call last):
File "/home/paul/workspace/workspace/Master/Elec_Price_Prediction/Time_Series.py", line 302, in <module>
obtain_prediction()
File "/home/paul/workspace/workspace/Master/Elec_Price_Prediction/Time_Series.py", line 212, in obtain_prediction
estimator.train(input_fn=train_input_fn, steps=10000)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 302, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 711, in _train_model
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 694, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/head.py", line 201, in create_estimator_spec
return self._train_ops(features)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/head.py", line 60, in _train_ops
estimator_lib.ModeKeys.TRAIN)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/state_management.py", line 67, in define_loss
return model.define_loss(features, mode)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/model.py", line 196, in define_loss
return self.get_batch_loss(features=features, mode=mode, state=start_state)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/model.py", line 509, in get_batch_loss
features, mode, state)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/model.py", line 609, in per_step_batch_loss
outputs=["loss"] + self._train_output_names)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/model.py", line 775, in _state_update_loop
loop_vars=initial_loop_arguments)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2816, in while_loop
result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2640, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 2590, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/model.py", line 726, in _state_update_step
state=state)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/timeseries/python/timeseries/model.py", line 605, in _batch_loss_filtering_step
predictions=predictions)
File "/home/paul/workspace/workspace/Master/Elec_Price_Prediction/Time_Series.py", line 105, in _filtering_step
prediction=tf.cond(pred,lambda:true_t,lambda:false_t)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/deprecation.py", line 316, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 1844, in cond
p_2, p_1 = switch(pred, pred)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/control_flow_ops.py", line 305, in switch
return gen_control_flow_ops._switch(data, pred, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_control_flow_ops.py", line 562, in _switch
"Switch", data=data, pred=pred, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2958, in create_op
set_shapes_for_outputs(ret)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2209, in set_shapes_for_outputs
shapes = shape_func(op)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2159, in call_with_requiring
return call_cpp_shape_fn(op, require_shape_fn=True)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/common_shapes.py", line 627, in call_cpp_shape_fn
require_shape_fn)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/common_shapes.py", line 691, in _call_cpp_shape_fn_impl
raise ValueError(err.message)
ValueError: Shape must be rank 0 but is rank 1 for 'head/model/while/state_update_step/cond/Switch' (op: 'Switch') with input shapes: [12], [12].
My question would be why this is impossible and how to work around it. I tried checking if pred and true_fn as well as false_fn have the same shape and they do.
I prefer tf.where. How about using tf.where?

tensorflow estimator from_generator, how to set TensorShape?

I am trying use a generator to feed data into estimator. The following is the code. However, when try to run, I got the following error:
Update2: I finally made it work. So the correct tensorshape is
([], [], [])
Update: I added tensorshape ([None], [None], [None]), then I changed ds.batch(10), to an assignment ds = ds.batch(10)
but still got error.
Traceback (most recent call last):
File "xyz.py", line 79, in <module>
tf.app.run(main=main, argv=None)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "xyz.py", line 67, in main
model.train(input_fn=lambda: input_fn(100))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 302, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 783, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 521, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 892, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 967, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 952, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1024, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 827, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 889, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1120, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1317, in _do_run
options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1336, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: exceptions.ValueError: `generator` yielded an element of shape () where an element of shape (?,) was expected.
[[Node: PyFunc = PyFunc[Tin=[DT_INT64], Tout=[DT_INT64, DT_STRING, DT_FLOAT], token="pyfunc_1"](arg0)]]
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,?], [?,?], [?,?]], output_types=[DT_INT64, DT_STRING, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator)]]
So my question, how to set the TensorShape? The from generator takes a third argument of TensorShape but I cannot find any example/doc on how to set it. Any help?
Thanks,
def gen(nn):
ii = 0
while ii < nn:
ii += 1
yield ii, 't{0}'.format(ii), ii*2
def input_fn(n):
ds = tf.data.Dataset.from_generator(lambda: gen(n), (tf.int64, tf.string, tf.float32), ([None], [None], [None]))
ds = ds.batch(10)
x, y, z = ds.make_one_shot_iterator().get_next()
return {'x': x, 'y': y}, tf.greater_equal(z, 10)
def build_columns():
x = tf.feature_column.numeric_column('x')
y = tf.feature_column.categorical_column_with_hash_bucket('y', hash_bucket_size=5)
return [x, y]
def build_estimator():
run_config = tf.estimator.RunConfig().replace(
session_config=tf.ConfigProto(device_count={'GPU': 0}))
return tf.estimator.LinearClassifier(model_dir=FLAGS.model_dir, feature_columns=build_columns(), config=run_config)
def main(unused):
# Clean up the model directory if present
shutil.rmtree(FLAGS.model_dir, ignore_errors=True)
model = build_estimator()
# Train and evaluate the model every `FLAGS.epochs_per_eval` epochs.
for n in range(FLAGS.train_epochs // FLAGS.epochs_per_eval):
model.train(input_fn=lambda: input_fn(100))
results = model.evaluate(input_fn=lambda: input_fn(20))
As mentioned by #FengTian in an update, the correct answer was to use shape ([], [], []) as the output shape of the generator:
tf.data.Dataset.from_generator(lambda: gen(n), (tf.int64, tf.string, tf.float32), ([], [], []))

Tensorboard exception with summary.image of shape [-1, 125, 128, 1] of MFCCs

Following this guide, I'm converting a tensor [batch_size, 16000, 1] to an MFCC using the method described in the link:
def gen_spectrogram(wav, sr=16000):
# A 1024-point STFT with frames of 64 ms and 75% overlap.
stfts = tf.contrib.signal.stft(wav, frame_length=1024, frame_step=256, fft_length=1024)
spectrograms = tf.abs(stfts)
# Warp the linear scale spectrograms into the mel-scale.
num_spectrogram_bins = stfts.shape[-1].value
lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
num_mel_bins, num_spectrogram_bins,
sample_rate, lower_edge_hertz, upper_edge_hertz)
mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
mel_spectrograms.set_shape(
spectrograms.shape[:-1].concatenate(
linear_to_mel_weight_matrix.shape[-1:]
)
)
# Compute a stabilized log to get log-magnitude mel-scale spectrograms.
log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
# Compute MFCCs from log_mel_spectrograms and take the first 13.
return tf.contrib.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[..., :13]
I then reshape the output of that to [batch_size, 125, 128, 1]. If I send that to a tf.layers.conv2d, things seem to work fine. However, if I try to tf.summary.image, I get the following error:
print(spec)
// => Tensor("spectrogram/Reshape:0", shape=(?, 125, 128, 1), dtype=float32)
tf.summary.image('spec', spec)
Caused by op u'spectrogram/stft/rfft', defined at:
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/Users/rsilveira/rnd/ml-engine/trainer/flatv1.py", line 103, in <module>
runner.run(model_fn)
File "trainer/runner.py", line 88, in run
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
File "/Library/Python/2.7/site-packages/tensorflow/python/estimator/training.py", line 432, in train_and_evaluate
executor.run_local()
File "/Library/Python/2.7/site-packages/tensorflow/python/estimator/training.py", line 611, in run_local
hooks=train_hooks)
File "/Library/Python/2.7/site-packages/tensorflow/python/estimator/estimator.py", line 302, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/Library/Python/2.7/site-packages/tensorflow/python/estimator/estimator.py", line 711, in _train_model
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
File "/Library/Python/2.7/site-packages/tensorflow/python/estimator/estimator.py", line 694, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/Users/rsilveira/rnd/ml-engine/trainer/flatv1.py", line 53, in model_fn
spec = gen_spectrogram(x)
File "/Users/rsilveira/rnd/ml-engine/trainer/flatv1.py", line 22, in gen_spectrogram
step,
File "/Library/Python/2.7/site-packages/tensorflow/contrib/signal/python/ops/spectral_ops.py", line 91, in stft
return spectral_ops.rfft(framed_signals, [fft_length])
File "/Library/Python/2.7/site-packages/tensorflow/python/ops/spectral_ops.py", line 136, in _rfft
return fft_fn(input_tensor, fft_length, name)
File "/Library/Python/2.7/site-packages/tensorflow/python/ops/gen_spectral_ops.py", line 619, in rfft
"RFFT", input=input, fft_length=fft_length, name=name)
File "/Library/Python/2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
op_def=op_def)
File "/Library/Python/2.7/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Input dimension 4 must have length of at least 512 but got: 320
Not sure where to start troubleshooting this. What am I missing here?

Training model from logits and checkpoint

I am new in tensor flow and I am trying to train the mobile net_v1. To do that, I first created the tfrecords' file for multi-class from a txt file.( example : namefile label1 label2 ...)
import sys, os
import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt
# function
def load_image(addr):
# read an image and resize to (224, 224)
# cv2 load images as BGR, convert it to RGB
img = cv2.imread(addr)
img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = img.astype(np.float32)
return img
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[*value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def loadData(inputs):
addrs = []
labels = []
f = open(inputs, 'r')
data = [ln.split(' ') for ln in f ]
f.close()
print(data)
for i in range(0, len(data)):
addrs.append(data[i][0].rstrip())
l = []
for j in range(1,len(data[i])):
if(data[i][j].rstrip().isdigit() == True):
l.append(int(data[i][j].rstrip()))
print(l)
labels.append(l)
return addrs, labels
def CreateTrainFile(input_filename, train_filename,):
path = '/home/rd/Documents/RD2/Databases/Faces/'
# load file and label
train_addrs, train_labels = loadData(input_filename)
print(train_labels)
# open the TFRecords file
writer = tf.python_io.TFRecordWriter(train_filename)
for i in range(len(train_addrs)):
# print how many images are saved every 1000 images
if not i % 1000:
print('Train data: {}/{}'.format(i, len(train_addrs)))
sys.stdout.flush()
# Load the image
img = load_image(train_addrs[i])
label = train_labels[i]
print('label : ', _int64_feature(label))
# Create a feature
feature = {'train/label': _int64_feature(label),
'train/image': _bytes_feature(tf.compat.as_bytes(img.tostring()))}
# Create an example protocol buffer
example = tf.train.Example(features=tf.train.Features(feature=feature))
# Serialize to string and write on the file
writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
# open the TFRecords file
def CreateValidationFile(val_filename):
writer = tf.python_io.TFRecordWriter(val_filename)
for i in range(len(val_addrs)):
# print how many images are saved every 1000 images
if not i % 1000:
print('Val data: {}/{}'.format(i, len(val_addrs)))
sys.stdout.flush()
# Load the image
img = load_image(val_addrs[i])
label = val_labels[i]
# Create a feature
feature = {'val/label': _int64_feature(label),
'val/image': _bytes_feature(tf.compat.as_bytes(img.tostring()))}
# Create an example protocol buffer
example = tf.train.Example(features=tf.train.Features(feature=feature))
# Serialize to string and write on the file
writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
# open the TFRecords file
def CreateTestFile(test_filename):
writer = tf.python_io.TFRecordWriter(test_filename)
for i in range(len(test_addrs)):
# print how many images are saved every 1000 images
if not i % 1000:
print('Test data: {}/{}'.format(i, len(test_addrs)))
sys.stdout.flush()
# Load the image
img = load_image(test_addrs[i])
label = test_labels[i]
# Create a feature
feature = {'test/label': _int64_feature(label),
'test/image': _bytes_feature(tf.compat.as_bytes(img.tostring()))}
# Create an example protocol buffer
example = tf.train.Example(features=tf.train.Features(feature=feature))
# Serialize to string and write on the file
writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
def ReadRecordFileTrain(data_path):
#data_path = 'train.tfrecords' # address to save the hdf5 file
with tf.Session() as sess:
feature = {'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64)}
# Create a list of filenames and pass it to a queue
filename_queue = tf.train.string_input_producer([data_path], num_epochs=1)
# Define a reader and read the next record
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Decode the record read by the reader
features = tf.parse_single_example(serialized_example, features=feature)
# Convert the image data from string back to the numbers
image = tf.decode_raw(features['train/image'], tf.float32)
# Cast label data into int32
label = tf.cast(features['train/label'], tf.int32)
# Reshape image data into the original shape
image = tf.reshape(image, [224, 224, 3])
# Any preprocessing here ...
# Creates batches by randomly shuffling tensors
images, labels = tf.train.shuffle_batch([image, label], batch_size=2, capacity=30, num_threads=1, min_after_dequeue=10)
return images, labels
def main():
train_filename = 'train.tfrecords' # address to save the TFRecords file
#test_filename = 'test.tfrecords' # address to save the TFRecords file
#val_filename = 'val.tfrecords' # address to save the TFRecords file
CreateTrainFile("data.txt", train_filename)
main()
and to read the tf records :
def ReadRecordFileTrain(data_path):
#data_path = 'train.tfrecords' # address to save the hdf5 file
with tf.Session() as sess:
feature = {'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([2], tf.int64)}
# Create a list of filenames and pass it to a queue
filename_queue = tf.train.string_input_producer([data_path], num_epochs=1)
# Define a reader and read the next record
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Decode the record read by the reader
features = tf.parse_single_example(serialized_example, features=feature)
# Convert the image data from string back to the numbers
image = tf.decode_raw(features['train/image'], tf.float32)
print('label1 :', features['train/label'] )
# Cast label data into int32
label = tf.cast(features['train/label'], tf.int32)
print('label load:', label)
# Reshape image data into the original shape
image = tf.reshape(image, [224, 224, 3])
# Any preprocessing here ...
# Creates batches by randomly shuffling tensors
images, labels = tf.train.batch([image, label], batch_size=2, capacity=30, num_threads=1)
return images, labels
I suppose it works but I am not sure ( I don't have any errors when I called these functions.)
Then, I load the model and its weight. Call the loss function and try to start the training, but it fails at this moment.
g = tf.Graph()
with g.as_default():
# size of the folder
inputs = tf.placeholder(tf.float32, [1, 224, 224, 3])
# load dataset
images, labels = ReadRecordFileTrain('train.tfrecords')
print('load dataset done')
print('labels = ', labels)
print('data = ', images)
print(tf.shape(labels))
# load network
network, end_points= mobilenet.mobilenet_v1(images, num_classes=2, depth_multiplier=0.25 )
print('load network done')
print('network : ', network)
variables_to_restore = slim.get_variables_to_restore(exclude=["MobilenetV1/Logits/Conv2d_1c_1x1"])
load_checkpoint = "modele_mobilenet_v1_025/mobilenet_v1_0.25_224.ckpt"
init_fn = slim.assign_from_checkpoint_fn(load_checkpoint, variables_to_restore)
print('custom network done')
# Specify the loss function:
tf.losses.softmax_cross_entropy(labels, network)
total_loss = tf.losses.get_total_loss()
#tf.scalar_summary('losses/total_loss', total_loss)
# Specify the optimization scheme:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=.001)
# create_train_op that ensures that when we evaluate it to get the loss,
# the update_ops are done and the gradient updates are computed.
train_tensor = slim.learning.create_train_op(total_loss, optimizer)
print('loss and optimizer chosen')
# Actually runs training.
save_checkpoint = 'model/modelcheck'
# start training
learning = slim.learning.train(train_tensor, save_checkpoint, init_fn=init_fn, number_of_steps=1000)
The error message :
label1 : Tensor("ParseSingleExample/Squeeze_train/label:0", shape=(2,), dtype=int64)
label load: Tensor("Cast:0", shape=(2,), dtype=int32)
load dataset done
labels = Tensor("batch:1", shape=(2, 2), dtype=int32)
data = Tensor("batch:0", shape=(2, 224, 224, 3), dtype=float32)
Tensor("Shape:0", shape=(2,), dtype=int32)
load network done
network : Tensor("MobilenetV1/Logits/SpatialSqueeze:0", shape=(2, 2), dtype=float32)
custom network done
loss and optimizer chosen
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1039, in _do_call
return fn(*args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1021, in _run_fn
status, run_metadata)
File "/usr/lib/python3.5/contextlib.py", line 66, in __exit__
next(self.gen)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [1,1,256,2] rhs shape= [1,1,256,1]
[[Node: save_1/Assign_109 = Assign[T=DT_FLOAT, _class=["loc:#MobilenetV1/Logits/Conv2d_1c_1x1/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/cpu:0"](MobilenetV1/Logits/Conv2d_1c_1x1/weights, save_1/RestoreV2_109)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test.py", line 106, in <module>
main()
File "test.py", line 103, in main
learning = slim.learning.train(train_tensor, save_checkpoint, init_fn=init_fn, number_of_steps=1000)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 725, in train
master, start_standard_services=False, config=session_config) as sess:
File "/usr/lib/python3.5/contextlib.py", line 59, in __enter__
return next(self.gen)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/supervisor.py", line 960, in managed_session
self.stop(close_summary_writer=close_summary_writer)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/supervisor.py", line 788, in stop
stop_grace_period_secs=self._stop_grace_secs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join
six.reraise(*self._exc_info_to_raise)
File "/usr/lib/python3/dist-packages/six.py", line 686, in reraise
raise value
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/supervisor.py", line 949, in managed_session
start_standard_services=start_standard_services)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/supervisor.py", line 706, in prepare_or_wait_for_session
init_feed_dict=self._init_feed_dict, init_fn=self._init_fn)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/session_manager.py", line 256, in prepare_session
config=config)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/session_manager.py", line 188, in _restore_checkpoint
saver.restore(sess, ckpt.model_checkpoint_path)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1457, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 778, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 982, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1032, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1052, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [1,1,256,2] rhs shape= [1,1,256,1]
[[Node: save_1/Assign_109 = Assign[T=DT_FLOAT, _class=["loc:#MobilenetV1/Logits/Conv2d_1c_1x1/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/cpu:0"](MobilenetV1/Logits/Conv2d_1c_1x1/weights, save_1/RestoreV2_109)]]
Caused by op 'save_1/Assign_109', defined at:
File "test.py", line 106, in <module>
main()
File "test.py", line 103, in main
learning = slim.learning.train(train_tensor, save_checkpoint, init_fn=init_fn, number_of_steps=1000)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 642, in train
saver = saver or tf_saver.Saver()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1056, in __init__
self.build()
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 1086, in build
restore_sequentially=self._restore_sequentially)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 691, in build
restore_sequentially, reshape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 419, in _AddRestoreOps
assign_ops.append(saveable.restore(tensors, shapes))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py", line 155, in restore
self.op.get_shape().is_fully_defined())
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/state_ops.py", line 270, in assign
validate_shape=validate_shape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_state_ops.py", line 47, in assign
use_locking=use_locking, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): Assign requires shapes of both tensors to match. lhs shape= [1,1,256,2] rhs shape= [1,1,256,1]
[[Node: save_1/Assign_109 = Assign[T=DT_FLOAT, _class=["loc:#MobilenetV1/Logits/Conv2d_1c_1x1/weights"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/cpu:0"](MobilenetV1/Logits/Conv2d_1c_1x1/weights, save_1/RestoreV2_109)]]
I don't understand where the problem comes from and how to solve it.
InvalidArgumentError: Assign requires shapes of both tensors to match.
lhs shape= [1,1,256,2] rhs shape= [1,1,256,1]
I used to get this error when my model saved in the model directory has conflicts with my current running model. Try deleting your model directory and start training again.
It seems to solve the error but now, when I want to execute it with a tf.Session it failed. I was wondering if the problem comes from my graph or am I doing something wrong in the tf.Session ?
def evaluation(logits, labels):
with tf.name_scope('Accuracy'):
# Operation comparing prediction with true label
correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(labels, 1))
# Operation calculating the accuracy of the predictions
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Summary operation for the accuracy
#tf.scalar_summary('train_accuracy', accuracy)
return accuracy
g = tf.Graph()
with g.as_default():
# size of the folder
inputs = tf.placeholder(tf.float32, [1, 224, 224, 3])
# load dataset
images, labels = ReadRecordFileTrain('train.tfrecords')
print('load dataset done')
print('labels = ', labels)
print('data = ', images)
print(tf.shape(labels))
# load network
network, end_points= mobilenet.mobilenet_v1(images, num_classes=2, depth_multiplier=0.25 )
print('load network done')
print('network : ', network)
variables_to_restore = slim.get_variables_to_restore(exclude=["MobilenetV1/Logits/Conv2d_1c_1x1"])
load_checkpoint = "modele_mobilenet_v1_025/mobilenet_v1_0.25_224.ckpt"
init_fn = slim.assign_from_checkpoint_fn(load_checkpoint, variables_to_restore)
print('custom network done')
# Specify the loss function:
tf.losses.softmax_cross_entropy(labels, network)
total_loss = tf.losses.get_total_loss()
#tf.scalar_summary('losses/total_loss', total_loss)
# Specify the optimization scheme:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=.001)
# create_train_op that ensures that when we evaluate it to get the loss,
# the update_ops are done and the gradient updates are computed.
train_tensor = slim.learning.create_train_op(total_loss, optimizer)
print('loss and optimizer chosen')
# Actually runs training.
save_checkpoint = 'model/modelcheck'
# start training
learning = slim.learning.train(train_tensor, save_checkpoint, init_fn=init_fn, number_of_steps=1000)
accuracy = evaluation(network, labels)
with tf.Session(graph=g) as sess:
sess.run(network)
print('network load')
sess.run(total_loss)
sess.run(accuracy)
sess.run(train_tensor)
sess.run(learning)
The error :
label1 : Tensor("ParseSingleExample/Squeeze_train/label:0", shape=(2,), dtype=int64)
label load: Tensor("Cast:0", shape=(2,), dtype=int32)
load dataset done
labels = Tensor("batch:1", shape=(4, 2), dtype=int32)
data = Tensor("batch:0", shape=(4, 224, 224, 3), dtype=float32)
Tensor("Shape:0", shape=(2,), dtype=int32)
load network done
network : Tensor("MobilenetV1/Logits/SpatialSqueeze:0", shape=(4, 2), dtype=float32)
custom network done
loss and optimizer chosen
end of graph
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1039, in _do_call
return fn(*args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1021, in _run_fn
status, run_metadata)
File "/usr/lib/python3.5/contextlib.py", line 66, in __exit__
next(self.gen)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.FailedPreconditionError: Attempting to use uninitialized value MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta
[[Node: MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta/read = Identity[T=DT_FLOAT, _class=["loc:#MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta"], _device="/job:localhost/replica:0/task:0/cpu:0"](MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "test.py", line 113, in <module>
main()
File "test.py", line 105, in main
sess.run(network)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 778, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 982, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1032, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1052, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.FailedPreconditionError: Attempting to use uninitialized value MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta
[[Node: MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta/read = Identity[T=DT_FLOAT, _class=["loc:#MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta"], _device="/job:localhost/replica:0/task:0/cpu:0"](MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta)]]
Caused by op 'MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta/read', defined at:
File "test.py", line 113, in <module>
main()
File "test.py", line 67, in main
network, end_points= mobilenet.mobilenet_v1(images, num_classes=2, depth_multiplier=0.25 )
File "/home/rd/Documents/RD2/users/Ludovic/tensorflow_mobilenet/mobilenet_v1.py", line 301, in mobilenet_v1
conv_defs=conv_defs)
File "/home/rd/Documents/RD2/users/Ludovic/tensorflow_mobilenet/mobilenet_v1.py", line 228, in mobilenet_v1_base
scope=end_point)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1891, in separable_convolution2d
outputs = normalizer_fn(outputs, **normalizer_params)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 528, in batch_norm
outputs = layer.apply(inputs, training=is_training)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 320, in apply
return self.__call__(inputs, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 286, in __call__
self.build(input_shapes[0])
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/normalization.py", line 125, in build
trainable=True)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 1049, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 948, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 349, in get_variable
validate_shape=validate_shape, use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 1389, in wrapped_custom_getter
*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 275, in variable_getter
variable_getter=functools.partial(getter, **kwargs))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 228, in _add_variable
trainable=trainable and self.trainable)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 1389, in wrapped_custom_getter
*args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1334, in layer_variable_getter
return _model_variable_getter(getter, *args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1326, in _model_variable_getter
custom_getter=getter, use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 262, in model_variable
use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 217, in variable
use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1334, in layer_variable_getter
return _model_variable_getter(getter, *args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1326, in _model_variable_getter
custom_getter=getter, use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 262, in model_variable
use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 181, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/framework/python/ops/variables.py", line 217, in variable
use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 341, in _true_getter
use_resource=use_resource)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 714, in _get_single_variable
validate_shape=validate_shape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variables.py", line 197, in __init__
expected_shape=expected_shape)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variables.py", line 316, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1338, in identity
result = _op_def_lib.apply_op("Identity", input=input, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
self._traceback = _extract_stack()
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta
[[Node: MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta/read = Identity[T=DT_FLOAT, _class=["loc:#MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta"], _device="/job:localhost/replica:0/task:0/cpu:0"](MobilenetV1/Conv2d_3_depthwise/BatchNorm/beta)]]

TensorFlow's Estimator can only get N-1 batches from tf.train.limit_epochs

Hi this is a follow up question from TensorFlow's Estimator froze with low CPU usage.
The following code works fine if the evaluate steps is 1, but if it is empty or 2, which should be the correct number of steps because there are four rows in feature_a and feature_b and the batch_size is 2, it will throw an OutOfRange Error. I suppose the Estimator should catch this OutOfRange and use it to stop the evaluation but it does not and the program exits.
import tensorflow as tf
from tensorflow.contrib.layers.python.layers.optimizers import optimize_loss
from tensorflow.contrib.learn.python.learn.estimators import model_fn
from tensorflow.contrib.learn.python.learn.estimators.estimator import Estimator
from tensorflow.python import debug as tf_debug
from tensorflow.python.framework import ops
def main(_):
hooks = [tf_debug.LocalCLIDebugHook()]
def func(features, targets, mode, params):
idx = tf.concat([features['a'], features['b']], axis=1)
embedding = tf.get_variable("embed", [10, 20], dtype=tf.float32)
pred = tf.reduce_sum(tf.nn.embedding_lookup(embedding, idx))
train_op = optimize_loss(loss=pred,
global_step=tf.train.get_global_step(),
learning_rate=0.001,
optimizer='Adam',
variables=tf.trainable_variables(),
name="training_loss_optimizer")
eval_metric_dict = dict()
eval_metric_dict['metric'] = pred
return model_fn.ModelFnOps(mode=mode,
predictions=pred,
loss=pred,
train_op=train_op,
eval_metric_ops=eval_metric_dict)
model = Estimator(func, params={})
model.fit(
input_fn=lambda: (
{'a': ops.convert_to_tensor([[1, 2, 3, 4, 5]]), 'b': ops.convert_to_tensor([[2, 3, 4, 3, 5]])},
None), max_steps=10)
testing_data_a = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
testing_data_b = [[2, 3, 4, 3, 5], [2, 3, 4, 3, 5], [2, 3, 4, 3, 5], [2, 3, 4, 3, 5]]
def test_input_fn():
feature_a = tf.train.limit_epochs(testing_data_a, num_epochs=1)
feature_b = tf.train.limit_epochs(testing_data_b, num_epochs=1)
feature_a_producer = tf.train.batch([feature_a], batch_size=2, enqueue_many=True, allow_smaller_final_batch=True)
feature_b_producer = tf.train.batch([feature_b], batch_size=2, enqueue_many=True, allow_smaller_final_batch=True)
return {'a': feature_a_producer, 'b': feature_b_producer}, None
for i in range(10):
# This does not work
print(model.evaluate(input_fn=test_input_fn))
# This does not work
# print(model.evaluate(input_fn=test_input_fn, steps=2))
# This do work
# print(model.evaluate(input_fn=test_input_fn, steps=1))
if __name__ == "__main__":
tf.app.run()
The error stack looks like this
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpo89sneqt
2017-02-08 21:51:17.428803: W tensorflow/core/framework/op_kernel.cc:993] Out of range: FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
Traceback (most recent call last):
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1022, in _do_call
return fn(*args)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1004, in _run_fn
status, run_metadata)
File "/usr/lib/python3.4/contextlib.py", line 66, in __exit__
next(self.gen)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/errors_impl.py", line 469, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.OutOfRangeError: FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/training/python/training/evaluation.py", line 442, in evaluate_once
session.run(eval_ops, feed_dict)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 469, in run
run_metadata=run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 793, in run
run_metadata=run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 751, in run
return self._sess.run(*args, **kwargs)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 898, in run
run_metadata=run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 751, in run
return self._sess.run(*args, **kwargs)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 965, in _run
feed_dict_string, options, run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1015, in _do_run
target_list, options, run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1035, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.OutOfRangeError: FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
Caused by op 'batch', defined at:
File "/data/bshi/ProjC/estimator_test.py", line 59, in <module>
tf.app.run()
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/data/bshi/ProjC/estimator_test.py", line 55, in main
print(model.evaluate(input_fn=test_input_fn))
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/util/deprecation.py", line 281, in new_func
return func(*args, **kwargs)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 507, in evaluate
log_progress=log_progress)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 798, in _evaluate_model
features, labels = input_fn()
File "/data/bshi/ProjC/estimator_test.py", line 49, in test_input_fn
feature_a_producer = tf.train.batch([feature_a], batch_size=2, enqueue_many=True, allow_smaller_final_batch=True)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/input.py", line 917, in batch
name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/input.py", line 710, in _batch
dequeued = queue.dequeue_up_to(batch_size, name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/ops/data_flow_ops.py", line 510, in dequeue_up_to
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 1402, in _queue_dequeue_up_to_v2
timeout_ms=timeout_ms, name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 2402, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
self._traceback = _extract_stack()
OutOfRangeError (see above for traceback): FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1022, in _do_call
return fn(*args)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1004, in _run_fn
status, run_metadata)
File "/usr/lib/python3.4/contextlib.py", line 66, in __exit__
next(self.gen)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/errors_impl.py", line 469, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.OutOfRangeError: FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/data/bshi/ProjC/estimator_test.py", line 59, in <module>
tf.app.run()
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/data/bshi/ProjC/estimator_test.py", line 55, in main
print(model.evaluate(input_fn=test_input_fn))
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/util/deprecation.py", line 281, in new_func
return func(*args, **kwargs)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 507, in evaluate
log_progress=log_progress)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 825, in _evaluate_model
config=config_pb2.ConfigProto(allow_soft_placement=True))
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/training/python/training/evaluation.py", line 442, in evaluate_once
session.run(eval_ops, feed_dict)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 485, in __exit__
self._close_internal(exception_type)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/monitored_session.py", line 515, in _close_internal
h.end(self._coordinated_creator.tf_sess)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 663, in end
feed_dict=self._final_ops_feed_dict)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 965, in _run
feed_dict_string, options, run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1015, in _do_run
target_list, options, run_metadata)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/client/session.py", line 1035, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.OutOfRangeError: FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
Caused by op 'batch', defined at:
File "/data/bshi/ProjC/estimator_test.py", line 59, in <module>
tf.app.run()
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "/data/bshi/ProjC/estimator_test.py", line 55, in main
print(model.evaluate(input_fn=test_input_fn))
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/util/deprecation.py", line 281, in new_func
return func(*args, **kwargs)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 507, in evaluate
log_progress=log_progress)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 798, in _evaluate_model
features, labels = input_fn()
File "/data/bshi/ProjC/estimator_test.py", line 49, in test_input_fn
feature_a_producer = tf.train.batch([feature_a], batch_size=2, enqueue_many=True, allow_smaller_final_batch=True)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/input.py", line 917, in batch
name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/training/input.py", line 710, in _batch
dequeued = queue.dequeue_up_to(batch_size, name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/ops/data_flow_ops.py", line 510, in dequeue_up_to
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 1402, in _queue_dequeue_up_to_v2
timeout_ms=timeout_ms, name=name)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 2402, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/data/bshi/py3env/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
self._traceback = _extract_stack()
OutOfRangeError (see above for traceback): FIFOQueue '_0_batch/fifo_queue' is closed and has insufficient elements (requested 2, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
Process finished with exit code 1