OutOfRangeError when creating batch from tfrecord file - tensorflow

I'm writing a script that saves certain features of my data to tfrecord. The features are numpy arrays (float32). When I read the tfrecord file I get the following error:
OutOfRangeError (see above for traceback): RandomShuffleQueue '_1_shuffle_batch/random_shuffle_queue' is closed and has insufficient elements (requested 20, current size 0)
[[Node: shuffle_batch = QueueDequeueManyV2[component_types=[DT_UINT8, DT_UINT8], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/device:CPU:0"](shuffle_batch/random_shuffle_queue, shuffle_batch/n)]]
I searched a lot, and apparently this error can be caused by different things. So far, I was not able to fix it. I recreated the problem with the following minimal code:
saving the toy data:
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature``.

    The value is stored as the only element of a ``tf.train.BytesList``.
    """
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
writer = tf.python_io.TFRecordWriter('stuff.tfrecords')
for i in range(100):
seq = np.random.uniform(size=(500,300)).astype(np.float32)
lbl = np.random.uniform(size=(90,1)).astype(np.float32)
feature = {'train/lbl': _bytes_feature(tf.compat.as_bytes(lbl.tostring())),
'train/seq': _bytes_feature(tf.compat.as_bytes(seq.tostring()))}
example = tf.train.Example(features=tf.train.Features(feature=feature))
writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
Reading the data:
def read_and_decode_single_example(filename):
    """Build the graph ops that read ``filename`` and produce shuffled batches.

    Every record is parsed as two raw byte strings stored under the keys
    'train/seq' and 'train/lbl'. Both are decoded as float32 and reshaped
    to [500, 300] and [90, 1] respectively before being handed to
    ``tf.train.shuffle_batch``.

    The function reads a module-level ``batch_size``, which must be defined
    by the surrounding script.

    Returns:
        A ``(sbatch, lbatch)`` tuple of batched sequence and label tensors.
    """
    # num_epochs=1: the filename is offered to the reader once only.
    queue = tf.train.string_input_producer([filename], num_epochs=1)
    record_reader = tf.TFRecordReader()
    _, record = record_reader.read(queue)

    spec = {
        'train/lbl': tf.FixedLenFeature([], tf.string),
        'train/seq': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(record, features=spec)

    raw_seq = tf.decode_raw(parsed['train/seq'], tf.float32)
    raw_lbl = tf.decode_raw(parsed['train/lbl'], tf.float32)
    seq = tf.reshape(raw_seq, [500, 300])
    lbl = tf.reshape(raw_lbl, [90, 1])

    sbatch, lbatch = tf.train.shuffle_batch(
        [seq, lbl],
        batch_size=batch_size,
        capacity=3 * batch_size,
        min_after_dequeue=batch_size,
    )
    return sbatch, lbatch
# Build the input-pipeline ops, then fetch a single batch inside a session.
sbatch, lbatch = read_and_decode_single_example("stuff.tfrecords" )
with tf.Session() as sess:
coord = tf.train.Coordinator()
# NOTE(review): the queue runners are started here, before the two
# initializer ops below are run. The input producer is created with
# num_epochs=1, which presumably keeps its epoch counter in a local
# variable -- confirm whether the initializers have to run first.
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
# Fetch one batch of sequences and labels.
s,l = sess.run([sbatch, lbatch])
# Ask the runner threads to stop and wait for them to finish.
coord.request_stop()
coord.join(threads)
I'm using Tensorflow-GPU v. 1.4.0.
Here is part of the traceback, which may be informative:
Caused by op 'shuffle_batch', defined at:
File "teststuff.py", line 59, in <module>
sbatch, lbatch = read_and_decode_single_example("stuff.tfrecords" )
File "teststuff.py", line 54, in read_and_decode_single_example
min_after_dequeue=batch_size)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/input.py", line 1225, in shuffle_batch
name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/input.py", line 796, in _shuffle_batch
dequeued = queue.dequeue_many(batch_size, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/data_flow_ops.py", line 464, in dequeue_many
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 2418, in _queue_dequeue_many_v2
component_types=component_types, timeout_ms=timeout_ms, name=name)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
op_def=op_def)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access

Related

Feeding example to tf predictor.from_saved_model() for estimator trained with tf hub module

I try to export the model for text classification with tf hub modules, and then infer a prediction from it for a single string example using predictor.from_saved_model(). I saw some examples of similar ideas, but still couldn't make it work for the case when using tf hub modules to build features. Here is what I do:
train_input_fn = tf.estimator.inputs.pandas_input_fn(
train_df, train_df['label_ids'], num_epochs= None, shuffle=True)
# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
train_df, train_df['label_ids'], shuffle=False)
embedded_text_feature_column = hub.text_embedding_column(
key='sentence',
module_spec='https://tfhub.dev/google/nnlm-de-dim128/1')
#Estimator
estimator = tf.estimator.DNNClassifier(
hidden_units=[500, 100],
feature_columns=[embedded_text_feature_column],
n_classes=num_of_class,
optimizer=tf.train.AdagradOptimizer(learning_rate=0.003) )
# Training
estimator.train(input_fn=train_input_fn, steps=1000)
#prediction on training set
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
print('Training set accuracy: {accuracy}'.format(**train_eval_result))
feature_spec = tf.feature_column.make_parse_example_spec([embedded_text_feature_column])
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
export_dir_base = self.cfg['model_path']
servable_model_path = estimator.export_savedmodel(export_dir_base, serving_input_receiver_fn)
# Example message for inference
message = "Was ist denn los"
saved_model_predictor = predictor.from_saved_model(export_dir=servable_model_path)
content_tf_list = tf.train.BytesList(value=[str.encode(message)])
example = tf.train.Example(
features=tf.train.Features(
feature={
'sentence': tf.train.Feature(
bytes_list=content_tf_list
)
}
)
)
with tf.python_io.TFRecordWriter('the_message.tfrecords') as writer:
writer.write(example.SerializeToString())
reader = tf.TFRecordReader()
data_path = 'the_message.tfrecords'
filename_queue = tf.train.string_input_producer([data_path], num_epochs=1)
_, serialized_example = reader.read(filename_queue)
output_dict = saved_model_predictor({'inputs': [serialized_example]})
And the output:
Traceback (most recent call last):
File "/Users/dimitrs/component-pythia/src/pythia.py", line 321, in _train
model = algo.generate_model(samples, generation_id)
File "/Users/dimitrs/component-pythia/src/algorithm_layer/algorithm.py", line 56, in generate_model
model = self._process_training(samples, generation)
File "/Users/dimitrs/component-pythia/src/algorithm_layer/tf_hub_classifier.py", line 91, in _process_training
output_dict = saved_model_predictor({'inputs': [serialized_example]})
File "/Users/dimitrs/anaconda3/envs/pythia/lib/python3.6/site-packages/tensorflow/contrib/predictor/predictor.py", line 77, in __call__
return self._session.run(fetches=self.fetch_tensors, feed_dict=feed_dict)
File "/Users/dimitrs/anaconda3/envs/pythia/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/Users/dimitrs/anaconda3/envs/pythia/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/Users/dimitrs/anaconda3/envs/pythia/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/Users/dimitrs/anaconda3/envs/pythia/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Unable to get element as bytes.
Isn't serialized_example the right input that is suggested by serving_input_receiver_fn ?
So, all I needed was serialized_example = example.SerializeToString().
Writing the example to a file would require starting a session before reading it back. Simply serializing it is enough:
# Example message for inference
message = "Was ist denn los"
saved_model_predictor = predictor.from_saved_model(export_dir=servable_model_path)
content_tf_list = tf.train.BytesList(value=[message.encode('utf-8')])
sentence = tf.train.Feature(bytes_list=content_tf_list)
sentence_dict = {'sentence': sentence}
features = tf.train.Features(feature=sentence_dict)
example = tf.train.Example(features=features)
serialized_example = example.SerializeToString()
output_dict = saved_model_predictor({'inputs': [serialized_example]})

InvalidArgumentError : ConcatOp : Dimensions of inputs should match

I'm using Tensorflow 1.7 with dynamic_rnn. It runs fine at first, but at the 32nd step (the exact step changes between runs) the error appears. When i used smaller batch , it seems the code can run longer, but the error still popped up. I just can't figure out what's wrong.
from mapping import *
def my_input_fn(features, targets, batch_size=20, shuffle=True, num_epochs=None, sequece_lenth=None):
    """Return one-shot iterator tensors over (features, targets, lengths).

    The three inputs are sliced along their first dimension into a
    ``tf.data.Dataset``. The dataset is batched first, then repeated
    ``num_epochs`` times, and only then (optionally) shuffled, so the
    shuffle buffer of 10000 holds whole batches rather than single examples.

    Returns:
        A ``(features, labels, sequence)`` tuple of tensors for the next batch.
    """
    # warning: 2GB limit
    slices = (features, targets, sequece_lenth)
    dataset = tf.data.Dataset.from_tensor_slices(slices)
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    if shuffle:
        dataset = dataset.shuffle(10000)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels, batch_lengths = iterator.get_next()
    return batch_features, batch_labels, batch_lengths
def lstm_cell(lstm_size=50):
    """Create a ``tf.contrib.rnn.BasicLSTMCell`` constructed with ``lstm_size``."""
    cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    return cell
# Wrapper around a stacked LSTM (MultiRNNCell) with a squared-error loss
# helper and a training loop driven by an input function.
class RnnModel:
# Build the stacked cell: one lstm_cell per entry of `hidden_units`, and keep
# a zero state created for `batch_size` as both initial_state and state.
def __init__(self,
batch_size,
hidden_units,
time_steps,
num_features
):
self.batch_size = batch_size
self.hidden_units = hidden_units
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
[lstm_cell(i) for i in self.hidden_units])
# NOTE(review): the zero state is created for a fixed batch_size, so input
# batches of any other size presumably will not match it -- confirm against
# the batching done by the input function.
self.initial_state = stacked_lstm.zero_state(batch_size, tf.float32)
self.model = stacked_lstm
self.state = self.initial_state
self.time_steps = time_steps
self.num_features = num_features
# Loss: mean of ((outputs + 1) / 2 - targets) squared.
def loss_mean_squre(self, outputs, targets):
# tf.ones(self.batch_size) is a vector of ones with batch_size entries.
pos = tf.add(outputs, tf.ones(self.batch_size))
eve = tf.div(pos, 2)
error = tf.subtract(eve,
targets)
return tf.reduce_mean(tf.square(error))
# Build the graph (input tensors, dynamic_rnn, loss, optimizer) and run
# `num_steps` session steps, printing the loss ten times over the run.
# Returns (self.model, self.state).
def train(self,
num_steps,
learningRate,
input_fn,
inputs,
targets,
sequenceLenth):
periods = 10
step_per_periods = int(num_steps / periods)
# `sequence` is unpacked here but not used further in this method.
input, target, sequence = input_fn(inputs, targets, self.batch_size, shuffle=True, sequece_lenth=sequenceLenth)
initial_state = self.model.zero_state(self.batch_size, tf.float32)
outputs, state = tf.nn.dynamic_rnn(self.model, input, initial_state=initial_state)
# The RNN outputs are reshaped to [time_steps, batch_size] and only the
# last row is passed to the loss.
loss = self.loss_mean_squre(tf.reshape(outputs, [self.time_steps, self.batch_size])[-1], target)
optimizer = tf.train.AdamOptimizer(learning_rate=learningRate)
grads_and_vars = optimizer.compute_gradients(loss, self.model.variables)
# NOTE(review): the op returned by apply_gradients is discarded and the
# sess.run below fetches only [state, loss], so no update op appears to be
# executed -- confirm whether training is intended here.
optimizer.apply_gradients(grads_and_vars)
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
for i in range(num_steps):
# NOTE(review): init_op is run on every iteration of the loop, which
# presumably resets the variables each step -- confirm this is intended.
sess.run(init_op)
state2, current_loss= sess.run([state, loss])
if i % step_per_periods == 0:
print("period " + str(int(i / step_per_periods)) + ":" + str(current_loss))
# self.state is the zero state assigned in __init__, not the `state` tensor
# produced by dynamic_rnn above.
return self.model, self.state
def processFeature(df):
    """Convert the 'vecs' column of ``df`` into nested plain Python lists.

    Each cell of ``df["vecs"]`` is expected to be an iterable of iterables
    (for example a sequence of per-step vectors). The result has one entry
    per row, in row order; every inner iterable is converted with ``list``.

    Args:
        df: DataFrame containing at least the columns 'class' and 'vecs'.

    Returns:
        A list with one element per row, each a list of lists.

    Raises:
        KeyError: if the 'class' or 'vecs' column is missing.
    """
    # Use the keyword form: the positional ``axis`` argument of
    # DataFrame.drop was removed in pandas 2.0. The drop is kept so that a
    # frame without a 'class' column still raises KeyError as before.
    df = df.drop(columns='class')
    # Iterate over the column values directly instead of looking them up by
    # the labels 0..n-1, so frames with a non-default index (e.g. after
    # filtering) work as well.
    return [[list(step) for step in sample] for sample in df["vecs"]]
def processTargets(df):
    """Return the 'class' column of ``df`` as a tensor built from float values."""
    class_column = df["class"].copy()
    float_values = class_column.astype(float).tolist()
    return tf.convert_to_tensor(float_values)
if __name__ == '__main__':
dividNumber = 30
"""
some code here to modify my data to input
it looks like this:
inputs before use input function : [fullLenth, charactorLenth, embeddinglenth]
"""
model = RnnModel(15, [100, 80, 80, 1], time_steps=dividNumber, num_features=25)
model.train(5000, 0.0001, my_input_fn, training_examples, training_targets, sequenceLenth=trainSequenceL)
And the error is below:
Traceback (most recent call last):
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1330, in _do_call
return fn(*args)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1315, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1423, in _call_tf_sessionrun
status, run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [20,25] vs. shape[1] = [30,100]
[[Node: rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](rnn/while/TensorArrayReadV3, rnn/while/Switch_4:1, rnn/while/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/Const)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/programming/mlwords/dnn_gragh.py", line 198, in <module>
model.train(5000, 0.0001, my_input_fn, training_examples, training_targets, sequenceLenth=trainSequenceL)
File "D:/programming/mlwords/dnn_gragh.py", line 124, in train
state2, current_loss, nowAccuracy = sess.run([state, loss, accuracy])
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 908, in run
run_metadata_ptr)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1143, in _run
feed_dict_tensor, options, run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1324, in _do_run
run_metadata)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\client\session.py", line 1343, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [20,25] vs. shape[1] = [30,100]
[[Node: rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](rnn/while/TensorArrayReadV3, rnn/while/Switch_4:1, rnn/while/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/Const)]]
Caused by op 'rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat', defined at:
File "D:/programming/mlwords/dnn_gragh.py", line 198, in <module>
model.train(5000, 0.0001, my_input_fn, training_examples, training_targets, sequenceLenth=trainSequenceL)
File "D:/programming/mlwords/dnn_gragh.py", line 95, in train
outputs, state = tf.nn.dynamic_rnn(self.model, input, initial_state=initial_state)#,sequence_length=sequence
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 627, in dynamic_rnn
dtype=dtype)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 824, in _dynamic_rnn_loop
swap_memory=swap_memory)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 3205, in while_loop
result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2943, in BuildLoop
pred, body, original_loop_vars, loop_vars, shape_invariants)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2880, in _BuildLoop
body_result = body(*packed_vars_for_body)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 3181, in <lambda>
body = lambda i, lv: (i + 1, orig_body(*lv))
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 795, in _time_step
(output, new_state) = call_cell()
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn.py", line 781, in <lambda>
call_cell = lambda: cell(input_t, state)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 232, in __call__
return super(RNNCell, self).__call__(inputs, state)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\layers\base.py", line 714, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 1283, in call
cur_inp, new_state = cell(cur_inp, cur_state)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 339, in __call__
*args, **kwargs)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\layers\base.py", line 714, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 620, in call
array_ops.concat([inputs, h], 1), self._kernel)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\array_ops.py", line 1181, in concat
return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 1101, in concat_v2
"ConcatV2", values=values, axis=axis, name=name)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\ops.py", line 3309, in create_op
op_def=op_def)
File "D:\Anaconda3\envs\tensorflow-cpu\lib\site-packages\tensorflow\python\framework\ops.py", line 1669, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): ConcatOp : Dimensions of inputs should match: shape[0] = [20,25] vs. shape[1] = [30,100]
[[Node: rnn/while/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](rnn/while/TensorArrayReadV3, rnn/while/Switch_4:1, rnn/while/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/Const)]]
This is the code I used to check my input:
# Sanity-check the tensors produced by my_input_fn: runs the input pipeline
# in a session and asserts the size of every fetched batch.
def checkData(inputs, targets, sequencelence):
batch_size = 20
features, target, sequece = my_input_fn(inputs, targets, batch_size=batch_size, shuffle=True, num_epochs=None,
sequece_lenth=sequencelence)
with tf.Session() as sess:
for i in range(1000):
features1, target1, sequece1 = sess.run([features, target, sequece])
# Each fetched component must contain exactly batch_size entries.
assert len(features1) == batch_size
# Every sentence must have 30 entries, each of length 25.
for sentence in features1 :
assert len(sentence) == 30
for word in sentence:
assert len(word) == 25
assert len(target1) == batch_size
assert len(sequece1) == batch_size
print(target1)
print("OK")
The error is coming from the LSTMCell.call method. There we are trying to tf.concat([inputs, h], 1), meaning that we want to concatenate the next input with the current hidden state before matmul'ing with the kernel variables matrix. The error is saying that you can't do it because the batch (0th) dimensions don't match up - your input is shaped [20,25] and your hidden state is shaped [30,100].
For some reason on your 32nd iteration, or whenever you see the error, the input is not batched to 30, but only to 20. This usually happens at the end of your training data when the total number of training examples is not evenly divisible by your batch size. This hypothesis is also consistent with the "When i used smaller batch , it seems the code can run longer" statement.
I had the same issue. When I corrected the image input size to match the input shape, it ran without errors.

Error when reading data from TFRecord file using string_input_producer

I wrote a script to change MNIST data into TFRecord format:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np
def _init64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding an ``Int64List``."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` holding a ``BytesList``."""
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)
mnist = input_data.read_data_sets("/path/to/data", dtype=tf.uint8, one_hot=True)
images = mnist.train.images
labels = mnist.train.labels
num_examples = mnist.train.num_examples
num_shards = 10
instances_per_shard = int(num_examples / num_shards)
idx = 0
for i in range(num_shards):
filename = '/tmp/mnist/tfrecord-%.2d' % i
writer = tf.python_io.TFRecordWriter(filename)
for j in range(instances_per_shard):
example = tf.train.Example(features=tf.train.Features(feature={
'label': _bytes_feature(labels[idx].tostring()),
'image_raw': _bytes_feature(images[idx].tostring())
}))
writer.write(example.SerializeToString())
idx += 1
writer.close()
then read data from TFRecords files:
import tensorflow as tf
files = tf.train.match_filenames_once('/tmp/mnist/tfrecord-*')
filename_queue = tf.train.string_input_producer(files, shuffle=False)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
features={
'image_raw': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.string)
}
)
image = tf.decode_raw(features['image_raw'], tf.uint8)
decode_image = tf.reshape(image, [28, 28, 1])
label = features['label']
#label = tf.decode_raw(features['label'], tf.uint8)
#label = tf.reshape(label, [10])
batch_size = 4
capacity = 1000 + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch([decode_image, label], batch_size=batch_size,
capacity=capacity, min_after_dequeue=30)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for i in range(4):
cur_example_batch, cur_label_batch = sess.run([example_batch, label_batch])
print(cur_label_batch)
coord.request_stop()
coord.join(threads)
It all runs fine. But if I uncomment these two lines:
label = tf.decode_raw(features['label'], tf.uint8)
label = tf.reshape(label, [10])
I get the following error:
Caused by op 'shuffle_batch', defined at:
File "/home/chenk/workspace/tflearn/Learning/create_batch.py", line 27, in <module>
capacity=capacity, min_after_dequeue=30)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py", line 1217, in shuffle_batch
name=name)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py", line 788, in _shuffle_batch
dequeued = queue.dequeue_many(batch_size, name=name)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/data_flow_ops.py", line 457, in dequeue_many
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 946, in _queue_dequeue_many_v2
timeout_ms=timeout_ms, name=name)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/opt/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
self._traceback = _extract_stack()
OutOfRangeError (see above for traceback): RandomShuffleQueue '_1_shuffle_batch/random_shuffle_queue' is closed and has insufficient elements (requested 4, current size 0)
[[Node: shuffle_batch = QueueDequeueManyV2[component_types=[DT_UINT8, DT_UINT8], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](shuffle_batch/random_shuffle_queue, shuffle_batch/n)]]
Is there something wrong in my code? what is the right way to do this?
Thanks!
The MNIST images are of type uint8, but the labels are of type float64. When you write the labels to the tfrecords with tostring(), each float64 value is converted to 8 bytes. So when you read the tfrecords back you should decode them as tf.float64. Decoding them as uint8 produces 80 values per label instead of 10, and the error is actually caused by the reshape() function.
label = tf.decode_raw(features['label'], tf.float64)
label = tf.reshape(label, [10])

Tensorflow batch training OutOfRangeError

Saving variables
Variables saved in 0.88 seconds
Saving metagraph
Metagraph saved in 35.81 seconds
Saving variables
Variables saved in 0.95 seconds
Saving metagraph
Metagraph saved in 33.20 seconds
Traceback (most recent call last):
Caused by op u'batch', defined at:
File "ava_train.py", line 155, in <module>
image_batch, label_batch = tf.train.batch([image, label], batch_size=batch_size, allow_smaller_final_batch=True)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/input.py", line 872, in batch
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/input.py", line 665, in _batch
dequeued = queue.dequeue_up_to(batch_size, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/data_flow_ops.py", line 510, in dequeue_up_to
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 1402, in _queue_dequeue_up_to_v2
timeout_ms=timeout_ms, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
self._traceback = _extract_stack()
OutOfRangeError (see above for traceback): FIFOQueue '_1_batch/fifo_queue' is closed and has insufficient elements (requested 100, current size 0)
[[Node: batch = QueueDequeueUpToV2[component_types=[DT_FLOAT, DT_INT32], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
my code is here
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable=False)
# process same as cifar10.distorted_inputs
log_dir = '../log'
model_dir = '../model'
max_num_epoch = 80
if not os.path.exists(log_dir):
os.makedirs(log_dir)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
num_train_example = len(os.listdir('../images/'))
# Reads paths of images together with their labels
image_list, label_list = read_labeled_image_list('../raw.txt')
images = ops.convert_to_tensor(image_list, dtype=dtypes.string)
labels = ops.convert_to_tensor(label_list, dtype=dtypes.int32)
# Makes an input queue
# input_queue = tf.train.slice_input_producer([images, labels], num_epochs=max_num_epoch, shuffle=True)
input_queue = tf.train.slice_input_producer([images, labels], shuffle=True)
image, label = read_images_from_disk(input_queue)
image_size = 240
keep_probability = 0.8
weight_decay = 5e-5
image = preprocess(image, image_size, image_size, None)
batch_size = 100
epoch_size = 1000
embedding_size = 128
# Optional Image and Label Batching
image_batch, label_batch = tf.train.batch([image, label], batch_size=batch_size, allow_smaller_final_batch=True)
This is the output of training an image classification model on 200,000 (20w) images. I set allow_smaller_final_batch=True in tf.train.batch. After some epochs the OutOfRangeError occurred.
I don't know the reason and thanks for the help.
Since you get an OutOfRangeError, it could be that you are training for more epochs than max_num_epochs, which will result in the slice_input_producer throwing this exception.
One possible workaround would be to remove the num_epochs=max_num_epochs from your slice_input_producer since this will allow it to produce even after the maximum number of epochs has been reached.
I battled with this particular error for days and finally found the cause: you are getting this error because your input file is corrupted somewhere. Try running this code on different training and test data.

MomentumOptimizer error: Attempting to use uninitialized value Variable_2/Momentum

I'm learning TensorFlow. I was trying tf.train.MomentumOptimizer but I got the following error:
Traceback (most recent call last):
File "relu.py", line 98, in <module>
learner.run(stop=0.01, print_epoch=True)
File "relu.py", line 70, in run
self.sess.run(train_step, feed_dict={self.x: batch_xs, self.y_: batch_ys})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 965, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1015, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1035, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.FailedPreconditionError: Attempting to use uninitialized value Variable_2/Momentum
[[Node: Momentum/update_Variable_2/ApplyMomentum = ApplyMomentum[T=DT_FLOAT, _class=["loc:#Variable_2"], use_locking=false, use_nesterov=false, _device="/job:localhost/replica:0/task:0/cpu:0"](Variable_2, Variable_2/Momentum, Momentum/learning_rate, gradients/add_1_grad/tuple/control_dependency_1, Momentum/momentum)]]
Caused by op u'Momentum/update_Variable_2/ApplyMomentum', defined at:
File "relu.py", line 98, in <module>
learner.run(stop=0.01, print_epoch=True)
File "relu.py", line 55, in run
train_step = self.optimizer.minimize(self.cross_entropy)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 289, in minimize
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 413, in apply_gradients
update_ops.append(processor.update_op(self, grad))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 61, in update_op
return optimizer._apply_dense(g, self._v) # pylint: disable=protected-access
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/momentum.py", line 69, in _apply_dense
use_nesterov=self._use_nesterov).op
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/gen_training_ops.py", line 348, in apply_momentum
use_nesterov=use_nesterov, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
self._traceback = _extract_stack()
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value Variable_2/Momentum
[[Node: Momentum/update_Variable_2/ApplyMomentum = ApplyMomentum[T=DT_FLOAT, _class=["loc:#Variable_2"], use_locking=false, use_nesterov=false, _device="/job:localhost/replica:0/task:0/cpu:0"](Variable_2, Variable_2/Momentum, Momentum/learning_rate, gradients/add_1_grad/tuple/control_dependency_1, Momentum/momentum)]]
And following is my code:
import time
import numpy as np
import tensorflow as tf
import tensorflow.examples.tutorials.mnist.input_data as input_data
# Small fully connected MNIST classifier: builds the graph in __init__,
# initializes variables in prepare(), trains and evaluates in run().
class ReluMnistNet:
# Build placeholders, one hidden layer of 100 units and the output layer,
# and define the softmax cross-entropy loss. Falls back to plain gradient
# descent (learning rate 0.01) when no optimizer is passed.
def __init__(self, optimizer=None):
self.varlist = []
self.optimizer = optimizer or tf.train.GradientDescentOptimizer(0.01)
# fetch dataset
self.mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# prepare environment
layers = [ 100 ]
input_layer = 784
output_layer = 10
self.x = tf.placeholder(tf.float32, [None, input_layer])
last_layer = input_layer
y = self.x
for layer in layers:
b = tf.Variable(tf.zeros([layer]))
self.varlist.append(b)
W = tf.Variable(tf.random_normal([last_layer,layer], stddev=0.01))
self.varlist.append(W)
# Note: the bias is added after the ReLU, not inside it.
y = tf.nn.relu( tf.matmul(y,W) ) + b
last_layer = layer
b = tf.Variable(tf.zeros([output_layer]))
self.varlist.append(b)
W = tf.Variable(tf.random_normal([last_layer,output_layer], stddev=0.01))
self.varlist.append(W)
self.y = tf.matmul(y,W) + b
self.y_ = tf.placeholder(tf.float32, [None, 10])
self.cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=self.y, labels=self.y_) )
# Create a session and run the variable initializer in it.
def prepare(self):
# init = tf.initialize_variables(self.varlist)
# NOTE(review): the init op is created here; variables created afterwards
# (e.g. by the minimize() call in run()) are presumably not covered by it
# -- confirm.
init = tf.initialize_all_variables()
self.sess = tf.Session()
self.sess.run(init)
# Train on random mini-batches and evaluate test accuracy, stopping early
# once the accuracy changes by less than `stop`.
# Returns (accuracy_history, elapsed_seconds).
def run(self, batch_size=100, stop=0.001, print_epoch=False):
mnist = self.mnist
data_size = mnist.train.images.shape[0]
last_accuracy = 0
accuracy_history = []
# The training op is built here, on every call to run().
train_step = self.optimizer.minimize(self.cross_entropy)
time1 = time.time()
for i in range(10000):
# NOTE(review): data_size/batch_size relies on integer division; this file
# uses the Python 2 print statement below, so presumably Python 2 -- confirm.
for j in range(data_size/batch_size):
# random batch
# The full index array is shuffled and its first batch_size entries are used.
batch_idx = np.arange(data_size)
np.random.shuffle(batch_idx)
batch_idx = batch_idx[0:batch_size]
batch_xs = mnist.train.images[batch_idx]
batch_ys = mnist.train.labels[batch_idx]
# ordered batch
# start = j * batch_size
# end = (j+1) * batch_size
# batch_xs, batch_ys = mnist.train.images[start:end], mnist.train.labels[start:end]
self.sess.run(train_step, feed_dict={self.x: batch_xs, self.y_: batch_ys})
# test the accuracy
correct_prediction = tf.equal( tf.argmax(self.y,1), tf.argmax(self.y_,1) )
accuracy = tf.reduce_mean( tf.cast(correct_prediction, tf.float32) )
# `accuracy` is rebound from the tensor to its evaluated value on the test set.
accuracy = self.sess.run(accuracy, feed_dict = {self.x: mnist.test.images, self.y_: mnist.test.labels})
accuracy_history.append(accuracy)
if print_epoch:
print i, accuracy
# Stop once the accuracy changed by less than `stop` since the last check.
if last_accuracy != 0 and abs(last_accuracy-accuracy) < stop:
break
last_accuracy = accuracy
time2 = time.time()
return accuracy_history, (time2-time1)
# Close the session if one is set and reset the attribute to None.
def close(self):
if not (self.sess is None):
self.sess.close()
self.sess = None
if __name__ == '__main__':
learner = ReluMnistNet()
# learner.optimizer = tf.train.GradientDescentOptimizer(0.01)
learner.optimizer = tf.train.MomentumOptimizer(0.01, momentum=0.9)
for i in range(10):
learner.prepare()
learner.run(stop=0.01, print_epoch=True)
learner.close()
It seems like a variable named Momentum is uninitialized? However, by calling learner.prepare(), I have called tf.initialize_all_variables(). Moreover, I have no variable named Momentum. Why does this happen?
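In your code you are calling minimize() after initializing the global variables, so the variables that the optimizer creates are never initialized.
Instead, create the training op in __init__, before the initializer runs: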
self.cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=self.y, labels=self.y_) )
self.optimize = self.optimizer.minimize(self.cross_entropy)
and in run function instead of
train_step = self.optimizer.minimize(self.cross_entropy)
you should call
train_step = self.optimize
P.S.
Momentum is the default name for the MomentumOptimizer