deadlock is appearing when using tf.train.batch - tensorflow

the following code is aiming to load a pair of images with tensorflow. However, it is in deadlock. Especially, after I added the tf.train.batch part. If I get values before tf.train.batch, it works fine.
Could you point out which part is incorrect?
import tensorflow as tf
batch_size = 1
alist = [['a.jpg', 'b.jpg']] * 1000
logdir = './logdir'
NUM_THREADS = 5
with tf.Graph().as_default():
init = tf.constant(0, dtype=tf.int64)
global_step = tf.get_variable(name='global_step', trainable=False, initializer=init)
input_queue = tf.FIFOQueue(50, dtypes=[tf.string, tf.string], shapes=[[], []])
input_enqueue_op = input_queue.enqueue_many([alist[:, 0], alist[:, 1]])
input_dir, target_dir = input_queue.dequeue()
input_value = tf.read_file(input_dir)
input_img = tf.image.decode_jpeg(input_value, channels=3)
target_value = tf.read_file(target_dir)
target_img = tf.image.decode_jpeg(target_value, channels=3)
input_img = tf.image.resize_images(input_img, [224, 224])
input_img.set_shape((224, 224, 3))
input_img = tf.image.per_image_standardization(input_img)
target_img = tf.image.resize_images(target_img, [224, 224])
target_img.set_shape((224, 224, 3))
target_img = tf.image.per_image_standardization(target_img)
img_batch, gt_img_batch = tf.train.batch(
[input_img, target_img],
batch_size = 1,
num_threads = 1,
# shapes= [input_img.get_shape(), target_img.get_shape()],
capacity = 30,
enqueue_many=False,
allow_smaller_final_batch=True,
name='input_batch')
qr = tf.train.QueueRunner(input_queue, [input_enqueue_op] * NUM_THREADS)
with tf.Session() as sess:
coord = tf.train.Coordinator()
threads = qr.create_threads(sess, coord=coord, start=True)
for i in range(10):
a = sess.run(img_batch)
print(a.shape)
# Wait for threads to finish.
coord.request_stop()
coord.join(threads)

tf.train.batch creates own queue runners:
This function is implemented using a queue. A QueueRunner for the queue is added to the current Graph's QUEUE_RUNNER collection.
They also need to be started. TensoFlow has a function that starts all queue runners collected in the graph: tf.train.start_queue_runners.
Also it makes sense to add your queue runner to the corresponding collection using tf.train.add_queue_runner. This way start_queue_runners will also start your queue runner.

Related

Feeding Dataset Iterator to Tensorflow

Can i get a full example somewhere where they feed tf.data.Dataset iterator to a model? I'm trying to feed this data into a model without the help of tf.Estimators.
def preprocess_image(image):
image = tf.image.decode_jpeg(image, channels=1)
image = tf.image.resize_images(image, [224, 224])
image = tf.image.random_flip_left_right(image)
image /= 255.0
image = tf.cast(image, tf.float32)
image = tf.train.shuffle_batch([image],batch_size=16, num_threads=10, capacity=100000, min_after_dequeue=15)
return image
def load_and_preprocess_image(path):
image = tf.read_file(path)
return preprocess_image(image)
train_data_dx = tf.data.Dataset.from_tensor_slices(xray_data_train['full_path'].values)
train_data_dx = train_data_dx.map(load_and_preprocess_image, num_parallel_calls=8)
train_data_dy = xray_data_train['Finding_strings']
print(train_data_dx.output_shapes)
print(train_data_dx.output_types)
test_data_dx = tf.data.Dataset.from_tensor_slices(xray_data_test['full_path'].values)
test_data_dx = test_data_dx.map(load_and_preprocess_image, num_parallel_calls=8)
test_data_dy = xray_data_test['Finding_strings']
Here's a full example.
Note
Iterator must be initialized at the beginning
We can set number of epochs to perform by using repeat() method of number of epochs and batch() method for batch size. Note that I use first repeat() and then batch().
At each iteration we're using tf.Session() interface to access the next batch.
We use try-except since when repetition of data ends it raises tf.error.OutOfRangeError.
import tensorflow as tf
from sklearn.datasets import make_blobs
# generate dummy data for illustration
x_train, y_train = make_blobs(n_samples=25,
n_features=2,
centers=[[1, 1], [-1, -1]],
cluster_std=0.5)
n_epochs = 2
batch_size = 10
with tf.name_scope('inputs'):
x = tf.placeholder(tf.float32, shape=[None, 2])
y = tf.placeholder(tf.int32, shape=[None])
with tf.name_scope('logits'):
logits = tf.layers.dense(x,
units=2,
name='logits')
with tf.name_scope('loss'):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss_tensor = tf.reduce_mean(xentropy)
with tf.name_scope('optimizer'):
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss_tensor)
# create dataset `from_tensor_slices` and create iterator
dataset = tf.data.Dataset.from_tensor_slices({'x':x_train, 'y':y_train})
dataset = dataset.repeat(n_epochs).batch(10)
iterator = dataset.make_initializable_iterator()
with tf.Session() as sess:
sess.run([tf.global_variables_initializer(),
iterator.initializer]) # <-- must be initialized!
next_batch = iterator.get_next()
while True:
try:
batch = sess.run(next_batch) # <-- extract next batch
loss_val, _ = sess.run([loss_tensor, train_op],
feed_dict={x:batch['x'], y:batch['y']})
print(loss_val)
except tf.errors.OutOfRangeError:
break

Create predictions with custom CNN using tfrecord input

My aim is to classify images into ten categories. I have a tfrecord file as input. You can download it here (30 MB). My modified the code according to the answer:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
def my_cnn(images, num_classes, is_training): # is_training is not used...
with slim.arg_scope([slim.max_pool2d], kernel_size=[3, 3], stride=2):
net = slim.conv2d(images, 64, [5, 5])
net = slim.max_pool2d(net)
net = slim.conv2d(net, 64, [5, 5])
net = slim.max_pool2d(net)
net = slim.flatten(net)
net = slim.fully_connected(net, 192)
net = slim.fully_connected(net, num_classes, activation_fn=None)
return net
data_path = 'train-some.tfrecords'
with tf.Graph().as_default():
batch_size, height, width, channels = 10, 224, 224, 3
feature = {'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64)}
filename_queue = tf.train.string_input_producer([data_path], num_epochs=1)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(serialized_example, features=feature)
image = tf.decode_raw(features['train/image'], tf.float32)
label = tf.cast(features['train/label'], tf.int32)
image = tf.reshape(image, [224, 224, 3])
images, labels = tf.train.shuffle_batch([image, label], batch_size, capacity=30, num_threads=1, min_after_dequeue=10)
num_classes = 10
logits = my_cnn(images, num_classes, is_training=True)
probabilities = tf.nn.softmax(logits)
with tf.Session() as sess:
init_op = [tf.global_variables_initializer(), tf.local_variables_initializer()]
# Run the init_op, evaluate the model outputs and print the results:
sess.run(init_op)
#probabilities = sess.run(probabilities)
# Create a coordinator, launch the queue runner threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
while not coord.should_stop():
while True:
prob = sess.run(probabilities)
print('Probabilities Shape:')
print(prob.shape)
except tf.errors.OutOfRangeError:
# When done, ask the threads to stop.
print('Done training -- epoch limit reached')
finally:
coord.request_stop()
# Wait for threads to finish.
coord.join(threads)
# Save the model
saver = tf.train.Saver()
saver.save(sess, './slim_model/custom_model')
Unfortunately, I still have error messages:
ValueError: Tensor Tensor("Softmax:0", shape=(10, 10), dtype=float32) is not an element of this graph.
ValueError: Fetch argument cannot be interpreted as a Tensor. (Tensor Tensor("Softmax:0", shape=(10, 10), dtype=float32) is not an element of this graph.)
The issue is with your training. You need to start the queues using tf.train.start_queue_runners that will run a few threads to process and enqueue examples. Create a Coordinator and ask the queue runner to start its threads with the coordinator.
Check the code changes:
with tf.Session() as sess:
init_op = [tf.global_variables_initializer(), tf.local_variables_initializer()]
# Run the init_op, evaluate the model outputs and print the results:
sess.run(init_op)
#probabilities = sess.run(probabilities)
# Create a coordinator, launch the queue runner threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
while not coord.should_stop():
while True:
prob = sess.run(probabilities)
print('Probabilities Shape:')
print(prob.shape)
except tf.errors.OutOfRangeError:
# When done, ask the threads to stop.
print('Done training -- epoch limit reached')
finally:
coord.request_stop()
# Wait for threads to finish.
coord.join(threads)
# Save the model
saver = tf.train.Saver()
saver.save(sess, './slim_model/custom_model'
Output:
Probabilities Shape:
(10, 10)
Probabilities Shape:
(10, 10)
Probabilities Shape:
(10, 10)
Probabilities Shape:
(10, 10)
Probabilities Shape:
(10, 10)
Done training -- epoch limit reached
Code with the above fixes along with saving and restoring the model can be downloaded from here.

Something went wrong with the input pipeline in tensorflow

I am trying to get a batch of 64 images each has [64,224,224,3] dimensions and labels have [64]. There are 8126 _img_class and _img_names. However, I am getting an unexpected output. Basically, I am getting nothing and script never terminate when I run it.
def _get_images(shuffle=True):
"""Gets the images and labels as a batch"""
#get image and label list
_img_names,_img_class = _get_list()
filename_queue = tf.train.string_input_producer(_img_names)
#reader
image_reader = tf.WholeFileReader()
_, image_file = image_reader.read(filename_queue)
#decode jpeg
image_original = tf.image.decode_jpeg(image_file)
label_original = tf.convert_to_tensor(_img_class,dtype=tf.int32)
#image preprocessing
image = tf.image.resize_images(image_original, [224,224])
float_image = tf.cast(image,dtype=tf.float32)
float_image = tf.image.per_image_standardization(image)
#set the shape
float_image.set_shape((224, 224, 3))
label_original.set_shape([8126])
#parameters for shuffle
batch_size = 64
num_preprocess_threads = 16
num_examples_per_epoch = 8000
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(num_examples_per_epoch *
min_fraction_of_examples_in_queue)
if shuffle:
images_batch, label_batch = tf.train.shuffle_batch(
[float_image,label_original],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + 3 *
batch_size,
min_after_dequeue=min_queue_examples)
else:
images_batch, label_batch = tf.train.batch(
[float_image,label_original],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + 3 * batch_size)
return images_batch,label_batch
with tf.Session() as sess:
tf.global_variables_initializer().run()
# Coordinate the loading of image files.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
images,labels = _get_images(shuffle=True)
# Get an image tensor and print its value.
image_tensor,labels = sess.run([images,labels])
# Finish off the filename queue coordinator.
coord.request_stop()
coord.join(threads)
When I set enqueue_many=True I am getting the following error.
TypeError: 'Tensor' object is not iterable.
You need to start the queue_runners after calling the _get_images function. As queue is defined in that function.
...
images,labels = _get_images(shuffle=True)
tf.global_variables_initializer().run()
tf.local_variables_initializer().run()
# Coordinate the loading of image files.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
image_tensor,labels = sess.run([images,labels])

TensorFlow: No decrease in CTC loss while training BLSTM

I am trying to create an end-to-end trainable offline English Handwriting Recognition Model (without segmenting individual character). I am using the word dataset from IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing batch size, etc. but the loss keeps on fluctuating with no/significant overall decrease - TensorBoard visualization for cost at each step
I am new to TensorFlow so could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)
After trying a lot of things unsuccessfully, I found that an incorrect argument was provided to the sequence_length argument of tf.nn.ctc_loss. It should be set to 'length of input sequence' but I had set it to 'length of output sequence(labels - number of character)'
More details can be found in comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time
Also, if one has a GPU it would be better to use Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc) as it can speed up the training a lot.
The problem is that you are feeding raw images in the LSTM, so it is very difficult for it to extract any useful information. The CRNN paper first uses a series of convolutional layers to extract features from the images, and then these are fed into the LSTM.

Cannot load int variable from previous session in tensorflow 1.1

I have read many similar questions and just cannot get this to work properly.
I have my model being trained well and checkpoint files are being made every epoch. I want to have it so the program can continue from epoch x once reloaded and also for it to print that is on that epoch with every iteration. I could simply save the data outside of the checkpoint file, however I was also wanting to do this to give me confidence everything else is also being stored properly.
Unfortunately the value in the epoch/global_step variable is always still 0 when I restart.
import tensorflow as tf
import numpy as np
import tensorflow as tf
import numpy as np
# more imports
def extract_number(f): # used to get latest checkpint file
s = re.findall("epoch(\d+).ckpt",f)
return (int(s[0]) if s else -1,f)
def restore(init_op, sess, saver): # called to restore or just initialise model
list = glob(os.path.join("./params/e*"))
if list:
file = max(list,key=extract_number)
saver.restore(sess, file[:-5])
sess.run(init_op)
return
with tf.Graph().as_default() as g:
# build models
total_batch = data.train.num_examples / batch_size
epochLimit = 51
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
restore(init_op, sess, saver)
epoch = global_step.eval()
while epoch < epochLimit:
total_batch = data.train.num_examples / batch_size
for i in range(int(total_batch)):
sys.stdout.flush()
voxels = newData.eval()
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
sess.run(opt_G, feed_dict={z:batch_z, train:True})
sess.run(opt_D, feed_dict={input:voxels, z:batch_z, train:True})
with open("out/loss.csv", 'a') as f:
batch_loss_G = sess.run(loss_G, feed_dict={z:batch_z, train:False})
batch_loss_D = sess.run(loss_D, feed_dict={input:voxels, z:batch_z, train:False})
msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
print(msgOut)
epoch=epoch+1
sess.run(global_step.assign(epoch))
saver.save(sess, "params/epoch{0}.ckpt".format(epoch))
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
voxels = sess.run(x_, feed_dict={z:batch_z})
v = voxels[0].reshape([32, 32, 32]) > 0
util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
I also update the global step variable using assign at the bottom. Any ideas? Any help would be greatly appreciated.
When you call sess.run(init_op) after restoring this resets all variables to their initial values. Comment that line out and things should work.
My original code was wrong for several reasons because I was trying so many things. The first responder Alexandre Passos gives a valid point, but I believe what changed the game was also the use of scopes (maybe?).
Below is the working updated code if it helps anyone:
import tensorflow as tf
import numpy as np
# more imports
def extract_number(f): # used to get latest checkpint file
s = re.findall("epoch(\d+).ckpt",f)
return (int(s[0]) if s else -1,f)
def restore(sess, saver): # called to restore or just initialise model
list = glob(os.path.join("./params/e*"))
if list:
file = max(list,key=extract_number)
saver.restore(sess, file[:-5])
return saver, True, sess
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
sess.run(init_op)
return saver, False , sess
batch_size = 100
learning_rate = 0.0001
beta1 = 0.5
z_size = 100
save_interval = 1
data = dataset.read()
total_batch = data.train.num_examples / batch_size
def fill_queue():
for i in range(int(total_batch*epochLimit)):
sess.run(enqueue_op, feed_dict={batch: data.train.next_batch(batch_size)}) # runnig in seperate thread to feed a FIFOqueue
with tf.variable_scope("glob"):
global_step = tf.get_variable(name='global_step', initializer=0,trainable=False)
# build models
epochLimit = 51
saver = tf.train.Saver()
with tf.Session() as sess:
saver,rstr,sess = restore(sess, saver)
with tf.variable_scope("glob", reuse=True):
epocht = tf.get_variable(name='global_step', trainable=False, dtype=tf.int32)
epoch = epocht.eval()
while epoch < epochLimit:
total_batch = data.train.num_examples / batch_size
for i in range(int(total_batch)):
sys.stdout.flush()
voxels = newData.eval()
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
sess.run(opt_G, feed_dict={z:batch_z, train:True})
sess.run(opt_D, feed_dict={input:voxels, z:batch_z, train:True})
with open("out/loss.csv", 'a') as f:
batch_loss_G = sess.run(loss_G, feed_dict={z:batch_z, train:False})
batch_loss_D = sess.run(loss_D, feed_dict={input:voxels, z:batch_z, train:False})
msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
print(msgOut)
epoch=epoch+1
sess.run(global_step.assign(epoch))
saver.save(sess, "params/epoch{0}.ckpt".format(epoch))
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
voxels = sess.run(x_, feed_dict={z:batch_z})
v = voxels[0].reshape([32, 32, 32]) > 0
util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)