My tensorboard events appear many charts I did not summary - tensorflow

I only summary my loss as 'xentropy_mean' in training() ,but in tensorboard ,I had not find the 'xentropy_mean' chart but many other charts I did not defined. I don't know where I wrote wrong, and what's the matter indeed. Is it because I use thread in my code? If I don't use thread, how should I wrote it?
The tensorboard screenshot
There are 6 charts under the queue,I don't know what are the meanings either
I create the model in the file below
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow.python.platform
import tensorflow as tf
# The MNIST dataset has 10 classes, representing the digits 0 through 9.
NUM_CLASSES = 16
# The MNIST images are always 28x28 pixels.
IMAGE_SIZE = 28
IMAGE_PIXELS = 784
def inference(images, hidden1_units, hidden2_units):
"""Build the MNIST model up to where it may be used for inference.
Args:
images: Images placeholder, from inputs().
hidden1_units: Size of the first hidden layer.
hidden2_units: Size of the second hidden layer.
Returns:
softmax_linear: Output tensor with the computed logits.
"""
# Hidden 1
with tf.name_scope('hidden1'):
weights = tf.Variable(
tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]),
name='biases')
hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
# Hidden 2
with tf.name_scope('hidden2'):
weights = tf.Variable(
tf.truncated_normal([hidden1_units, hidden2_units],
stddev=1.0 / math.sqrt(float(hidden1_units))),
name='weights')
biases = tf.Variable(tf.zeros([hidden2_units]),
name='biases')
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
# Linear
with tf.name_scope('softmax_linear'):
weights = tf.Variable(
tf.truncated_normal([hidden2_units, NUM_CLASSES],
stddev=1.0 / math.sqrt(float(hidden2_units))),
name='weights')
biases = tf.Variable(tf.zeros([NUM_CLASSES]),
name='biases')
logits = tf.matmul(hidden2, weights) + biases
return logits
def loss(logits, labels):
batch_size = tf.size(labels)
#print('batch size %d' %(batch_size))
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size), 1)
concated = tf.concat(1, [indices, labels])
#print('Done2')
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([batch_size, 16]), 1.0, 0.0)
#print('Done1')
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,
onehot_labels,
name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
tf.summary.scalar(loss.op.name, loss)
return loss
def training(loss, learning_rate):
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
global_step=tf.Variable(0,name='global_step',trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
def evaluation(logits, labels):
correct = tf.nn.in_top_k(logits, labels, 1)
# Return the number of true entries.
return tf.reduce_sum(tf.cast(correct, tf.int32))
and train the model in this file:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import time
import numpy as np
import tensorflow as tf
import mnist
# Basic model parameters as external flags.
#FLAGS = None
# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
TEST_FILE='test.tfrecords'
flags = tf.app.flags
FLAGS = flags.FLAGS
#FLAGS = None
flags.DEFINE_string('train_dir', '/home/queenie/image2tfrecord/tfrecords-28-gray/', 'Directory to put the training data.')
flags.DEFINE_string('filename', 'train.tfrecords', 'Directory to put the training data.')
flags.DEFINE_integer('batch_size', 100, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('num_epochs', None, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('hidden1', 128,'balabala')
flags.DEFINE_integer('hidden2', 32,'balabala')
flags.DEFINE_integer('learning_rate', 0.01,'balabala')
flags.DEFINE_integer('max_steps', 50000,'balabala')
def placeholder_inputs(batch_size):
images_placeholder=tf.placeholder(tf.float32,shape=(batch_size,mnist.IMAGE_PIXELS))
labels_placeholder=tf.placeholder(tf.int32,shape=(batch_size))
return images_placeholder,labels_placeholder
def fill_feed_dict(images_feed,labels_feed,images_pl,labels_pl):
feed_dict={
images_pl:images_feed,
labels_pl:labels_feed,
}
return feed_dict
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'image_raw': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.int64),
})
# Convert from a scalar string tensor (whose single string has
# length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
# [mnist.IMAGE_PIXELS].
image = tf.decode_raw(features['image_raw'], tf.uint8)
image.set_shape([mnist.IMAGE_PIXELS])
# OPTIONAL: Could reshape into a 28x28 image and apply distortions
# here. Since we are not applying any distortions in this
# example, and the next step expects the image to be flattened
# into a vector, we don't bother.
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
# Convert label from a scalar uint8 tensor to an int32 scalar.
label = tf.cast(features['label'], tf.int32)
return image, label
def do_eval(sess,eval_correct):
true_count=0
for step in xrange(FLAGS.batch_size):
#print(sess.run(eval_correct))
true_count+=sess.run(eval_correct)
precision=float(true_count)/FLAGS.batch_size/FLAGS.batch_size
print(' Num examples: %d Num correct: %d Precision # 1: %0.04f' %
(FLAGS.batch_size, true_count, precision))
return precision
def inputs(train, batch_size, num_epochs):
if not num_epochs: num_epochs = None
if train=='train':
filename=os.path.join(FLAGS.train_dir,TRAIN_FILE)
elif train=='validation':
filename=os.path.join(FLAGS.train_dir,VALIDATION_FILE)
else:
filename=os.path.join(FLAGS.train_dir,TEST_FILE)
# filename = os.path.join(FLAGS.train_dir,
# TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=None)
# Even when reading in multiple threads, share the filename
# queue.
image, label = read_and_decode(filename_queue)
# Shuffle the examples and collect them into batch_size batches.
# (Internally uses a RandomShuffleQueue.)
# We run this in two threads to avoid being a bottleneck.
images, sparse_labels = tf.train.shuffle_batch(
[image, label], batch_size=batch_size, num_threads=2,
capacity=1000 + 3 * batch_size,
# Ensures a minimum amount of shuffling of examples.
min_after_dequeue=1000)
return images, sparse_labels
def run_training():
with tf.Graph().as_default():
# Build a Graph that computes predictions from the inference model.
images, labels = inputs(train='train', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_valid,labels_valid=inputs(train='validation', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_test,labels_test=inputs(train='test', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
logits = mnist.inference(images,
FLAGS.hidden1,
FLAGS.hidden2)
# Add to the Graph the loss calculation.
valid_prediction=mnist.inference(images_valid,FLAGS.hidden1,FLAGS.hidden2)
test_prediction=mnist.inference(images_test,FLAGS.hidden1,FLAGS.hidden2)
loss = mnist.loss(logits, labels)
# Add to the Graph operations that train the model.
train_op = mnist.training(loss, FLAGS.learning_rate)
eval_correct=mnist.evaluation(logits,labels)
eval_correct_valid=mnist.evaluation(valid_prediction,labels_valid)
eval_correct_test=mnist.evaluation(test_prediction,labels_test)
summary_op=tf.merge_all_summaries()
# The op for initializing the variables.
init_op = tf.group(tf.initialize_all_variables(),
tf.initialize_local_variables())
saver = tf.train.Saver()
# Create a session for running operations in the Graph.
sess = tf.Session()
# Initialize the variables (the trained variables and the
# epoch counter).
sess.run(init_op)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
train_precision=0
validation_precision=0
test_precision=0
#while not coord.should_stop():
while not coord.should_stop():
start_time = time.time()
_, loss_value,images_see,labels_see = sess.run([train_op, loss,images,labels])
#print('run done')
duration = time.time() - start_time
# Print an overview fairly often.
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
precision_tr=do_eval(sess,eval_correct)
summary_str=sess.run(summary_op)
summary_writer.add_summary(summary_str,step)
if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_file, global_step=step)
print('Train:')
do_eval(sess,eval_correct)
print('Validation:')
do_eval(sess,eval_correct_valid)
print('Test:')
do_eval(sess,eval_correct_test)
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
# When done, ask the threads to stop.
coord.request_stop()
# Wait for threads to finish.
coord.join(threads)
sess.close()
run_training()
then I get the tensorboard like these,6 charts about queue.
The tensorboard screenshot

The queue charts you are seeing are created by default from shuffle_batch and friends, and can be used to monitor the performance of your input pipeline (you'll ideally want all the queues to stay at capacity, as that means your GPU isn't blocking on input reading).
I don't understand why your summary isn't showing in tensorboard. Can I get more information?

Related

L2 regularization keep increasing during training

I am finetuning InceptionResnetV2 on TensorFlow. When training, the regularization loss keep linearly increasing and even much larger than cross entropy loss in the later stage of training. I have checked the training procedure, and make sure I am optimizing the cross entropy loss and L2 loss combined.
Is there anyone explain this weird thing a little bit? Any feedback is appreciated.
Here is the code and some TensorBoard plots.
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
from preprocessing import aug_parallel_v2
import numpy as np
slim = tf.contrib.slim
# total training data number
sample_num = 625020
data_path = 'iNaturalist_train.tfrecords'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log_v5'
# tensorboard visualization path
filewriter_path = './filewriter_v5_Logits'
# State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
checkpoint_save_addr = './log_v5/fine-tuning_v5.ckpt'
# State the image size you're resizing your images to. We will use the default inception size of 299.
image_size = 299
# State the number of classes to predict:
num_classes = 8142
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 5
# State your batch size
batch_size = 60
# Learning rate information and configuration
initial_learning_rate = 0.0005
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 2
# put weight on different classes inversely proportional
# to total number of their image samples
label_count = np.loadtxt('label_count.txt', dtype=int)
inverse = lambda t: 1 / t
vfunc = np.vectorize(inverse)
multiplier = vfunc(label_count)
multiplier /= np.mean(multiplier)
def run():
if not os.path.exists(log_dir):
os.mkdir(log_dir)
feature = {'train/height': tf.FixedLenFeature([], tf.int64),
'train/width': tf.FixedLenFeature([], tf.int64),
'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64),
'train/sup_label': tf.FixedLenFeature([], tf.int64),
'train/aug_level': tf.FixedLenFeature([], tf.int64)}
# create a list of file names
filename_queue = tf.train.string_input_producer([data_path], num_epochs=None)
print(filename_queue)
reader = tf.TFRecordReader()
_, tfrecord_serialized = reader.read(filename_queue)
features = tf.parse_single_example(tfrecord_serialized, features=feature)
# Convert the image data from string back to the numbers
height = tf.cast(features['train/height'], tf.int64)
width = tf.cast(features['train/width'], tf.int64)
# change this line for your TFrecord version
tf_image = tf.image.decode_jpeg(features['train/image'])
tf_label = tf.cast(features['train/label'], tf.int32)
aug_level = tf.cast(features['train/aug_level'], tf.int32)
# tf_sup_label = tf.cast(features['train/sup_label'], tf.int64)
tf_image = tf.reshape(tf_image, tf.stack([height, width, 3]))
tf_label = tf.reshape(tf_label, [1])
aug_level = tf.reshape(aug_level, [1])
resized_image = tf.image.resize_images(images=tf_image, size=tf.constant([400, 400]), method=2)
resized_image = tf.cast(resized_image, tf.uint8)
tf_images, tf_labels, tf_aug = tf.train.shuffle_batch([resized_image, tf_label, aug_level], batch_size=batch_size,
capacity=2048, num_threads=16, allow_smaller_final_batch=False,
min_after_dequeue=256)
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
IMAGE_HEIGHT = 299
IMAGE_WIDTH = 299
images = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, 1])
weighted_level = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(sample_num / batch_size)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(images, num_classes=num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
print("label test")
print(labels)
print(logits)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = tf.squeeze(tf.one_hot(labels, num_classes), [1])
print(one_hot_labels)
print(logits)
weighted_onehot = tf.multiply(one_hot_labels, weighted_level)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
digits_loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_onehot, logits=logits)
reg_loss = tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# train_vars = []
# Now we can define the optimizer that takes on the learning rate
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
"InceptionResnetV2/Logits")
# RMSProp or Adam
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer, variables_to_train=train_vars)
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.metrics.accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
tf.summary.scalar('losses/Reg_Loss', reg_loss)
tf.summary.scalar('losses/Digit_Loss', digits_loss)
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
writer = tf.summary.FileWriter(filewriter_path)
writer.add_graph(tf.get_default_graph())
my_summary_op = tf.summary.merge_all()
def train_step(sess, train_op, global_step, imgs, lbls, weight):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed
for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: digit_loss: %.4f (%.2f sec/step)',
global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
saver_pretrain = tf.train.Saver(variables_to_restore)
saver_train = tf.train.Saver(train_vars)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
# Create a coordinator and run all QueueRunner objects
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
saver_pretrain.restore(sess, checkpoint_file)
start_time = time.time()
for step in range(int(num_steps_per_epoch * num_epochs)):
imgs, lbls, augs = sess.run([tf_images, tf_labels, tf_aug])
imgs, lbls = aug_parallel_v2(imgs, lbls, augs)
imgs = imgs[:, 50:349, 50:349, :]
imgs = 2*(imgs.astype(np.float32)) - 1
lbls = lbls.astype(np.int32)
weight = multiplier[lbls]
weight = np.array(weight).reshape((batch_size, 1))
# print(imgs[0, 0:10, 0:10, 0:2])
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 10 step.
if step % 20 == 0:
loss, global_step_count = train_step(sess, train_op, global_step, imgs, lbls, weight)
summaries = sess.run(my_summary_op, feed_dict={images: imgs, labels: lbls, weighted_level: weight})
writer.add_summary(summaries, global_step_count)
# sess.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, global_step, imgs, lbls, weight)
if step % 2000 == 0:
logging.info('Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
print('one batch time: ', time.time() - start_time)
start_time = time.time()
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
# Stop the threads
coord.request_stop()
# Wait for threads to stop
coord.join(threads)
sess.close()
if __name__ == '__main__':
run()
I am new here, and don't have enough reputation to post images.
Here are two links for the accuracy plot and losses plot. You can easily tell the regularization loss is in a dominant position.
This is a difficult question to answer. I can give some pointers though.
In general, when you try to minimize digits_loss, that is to fit your model to your data, you will slowly change the weights in your layers. To counter potential overfitting, a L2 regularization loss (the sum of the squares of all weights, reg_loss in your code) is generally added to the overall loss (total_loss in your code.) These two forces generally act against each other and if the balance is right, you train a good model.
In your case you're taking a network (resnet_v2) that was developed for 1,001 classes and try to predict 8,142 classes. No problem with that per se, but you're upsetting the balance. So I believe you need to override the default weight decay of 0.00004 for resnet v2 to some higher value, in this line (note only 3 zeros in the decimals for a 10x increase):
with slim.arg_scope( inception_resnet_v2_arg_scope( weight_decay = 0.0004 ) ):
A higher weight_decay parameter will force the L2 loss to decrease faster. The problem is that this number is just a guess, I have no idea what an ideal value would be. You need to experiment with multiple values and figure it out.

Dataset input from bmp images only 50% accurate

I've created this graph to try:
Import BMP files and generate label based on their filename (L/R).
Train a network to determine between the left and right eye.
Evaluate the network.
I'm using the new framework and get it all in as a dataset. The code runs, but I only get 50% accuracy (no learning happening).
Can anyone check that the graph is right and it's just my network I need to fix ?
""" Routine for processing Eye Image dataset
determines left/right eye
Using Tensorflow API v1.3
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import fnmatch
import tensorflow as tf
from six.moves import xrange # pylint: disable=redefined-builtin
import nnLayers as nnLayer
IMAGE_SIZE = 460
SCALE_SIZE = 100
NUM_CLASSES = 2
IMAGE_DEPTH = 3
FLAGS = tf.app.flags.FLAGS
# Basic model parameters.
tf.app.flags.DEFINE_integer('batch_size', 200,
"""Number of images to process in a batch.""")
tf.app.flags.DEFINE_integer('num_epochs', 1001,
"""Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('train_directory', './eyeImages',
"""directory of images to process.""")
tf.app.flags.DEFINE_string('test_directory', './eyeTest',
"""directory of images to process.""")
tf.app.flags.DEFINE_string('log_dir', './logs',
"""logging directory""")
def _parse_function(filename, label):
"""Takes filenames and labels and returns
one hot labels and image values"""
#read the file
image_string = tf.read_file(filename)
#decode BMP file
image_decoded = tf.image.decode_bmp(image_string)
#resize accordingly
image = tf.image.resize_images(image_decoded, [SCALE_SIZE, SCALE_SIZE])
#convert label to one hot
one_hot = tf.one_hot(label, NUM_CLASSES)
return image, one_hot
def inference(image):
#shape image for convolution
with tf.name_scope('input_reshape'):
x_image = tf.reshape(image, [-1, SCALE_SIZE, SCALE_SIZE, IMAGE_DEPTH]) #infer number of images, last dimension is features
tf.summary.image('input_images',x_image)
#neural net layers
#100x100x3 -> 50x50x32
h_pool1 = nnLayer.conv_layer(x_image, IMAGE_DEPTH, 5, 32, 'hiddenLayer1', act=tf.nn.relu)
#50x50x32 -> 25x25x64
h_pool2 = nnLayer.conv_layer(h_pool1, 32, 5, 64, 'hiddenLayer2', act=tf.nn.relu)
#25x25x64 -> 1024x2
h_fc1 = nnLayer.fc_layer(h_pool2, 64, 25, 1024, 'fcLayer1', act=tf.nn.relu)
#1024x2 ->1x2
with tf.name_scope('final-layer'):
with tf.name_scope('weights'):
W_fc2 = nnLayer.weight_variable([1024,NUM_CLASSES])
with tf.name_scope('biases'):
b_fc2 = nnLayer.bias_variable([NUM_CLASSES])
y_conv = tf.matmul(h_fc1, W_fc2) + b_fc2
return y_conv
def folderParser(folder):
"""output BMP file names in directory and
label based on file name"""
#create list of filenames in directory
files = os.listdir(folder)
#filter for BMP files
bmpfiles = fnmatch.filter(files, '*.bmp')
#create empty lists
labels = []
fullNames = []
#get the length of the filename and determine left/right label
for i in range(len(bmpfiles)):
length = len(bmpfiles[i])
fullNames.append(folder + '/' + bmpfiles[i])
if (bmpfiles[i][length-17])=='L':
labels.append(1)
else:
labels.append(0)
return fullNames,labels
def main(argv=None): # pylint: disable=unused-argument
#delete the log files if present
#if tf.gfile.Exists(FLAGS.log_dir):
# tf.gfile.DeleteRecursively(FLAGS.log_dir)
#tf.gfile.MakeDirs(FLAGS.log_dir)
#get file names and labels
trainNames, trainLabels = folderParser(FLAGS.train_directory)
testNames, testLabels = folderParser(FLAGS.test_directory)
# create a dataset of the file names and labels
tr_data = tf.contrib.data.Dataset.from_tensor_slices((trainNames, trainLabels))
ts_data = tf.contrib.data.Dataset.from_tensor_slices((testNames, testLabels))
#map the data set from file names to images
tr_data = tr_data.map(_parse_function)
ts_data = ts_data.map(_parse_function)
#shuffle the images
tr_data = tr_data.shuffle(FLAGS.batch_size*2)
ts_data = ts_data.shuffle(FLAGS.batch_size*2)
#create batches
tr_data = tr_data.batch(FLAGS.batch_size)
ts_data = ts_data.batch(FLAGS.batch_size)
#create handle for datasets
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.contrib.data.Iterator.from_string_handle(handle, tr_data.output_types, tr_data.output_shapes)
next_element = iterator.get_next()
#setup iterator
training_iterator = tr_data.make_initializable_iterator()
validation_iterator = ts_data.make_initializable_iterator()
#retrieve next batch
features, labels = iterator.get_next()
#run network
y_conv = inference(features)
#determine softmax and loss function
with tf.variable_scope('softmax_linear') as scope:
diff = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=y_conv)
with tf.name_scope('total'):
cross_entropy = tf.reduce_mean(diff)
tf.summary.scalar('cross_entropy', cross_entropy)
#run gradient descent
with tf.name_scope('train'):
training_op = tf.train.GradientDescentOptimizer(1e-3).minimize(cross_entropy)
#identify correct predictions
with tf.name_scope('correct_prediction'):
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(labels, 1))
#find the accuracy of the model
with tf.name_scope('accuracy'):
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', accuracy)
with tf.Session() as sess:
#initialization of the variables
training_handle = sess.run(training_iterator.string_handle())
validation_handle = sess.run(validation_iterator.string_handle())
sess.run(tf.global_variables_initializer())
#merge all the summaries and write test summaries
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
#run through epochs
for epoch in range(FLAGS.num_epochs):
#initialize the training set for training epoch
sess.run(training_iterator.initializer)
if epoch % 2 ==0:
#initialize validation set
sess.run(validation_iterator.initializer)
#test
summary, acc = sess.run([merged, accuracy], feed_dict={handle: validation_handle})
train_writer.add_summary(summary, epoch) #write to test file
print('step %s, accuracy %s' % (epoch, acc))
else:
#train
sess.run(training_op, feed_dict={handle: training_handle})
#close the log files
train_writer.close()
test_writer.close()
if __name__ == '__main__':
tf.app.run()
Aaron
The answer was image standardization:
image_std = tf.image.per_image_standardization (image_resized)
Without the image standardization the neurons were becoming saturated. Improved the outcome straight away.
Thanks.

TensorFlow: No decrease in CTC loss while training BLSTM

I am trying to create an end-to-end trainable offline English Handwriting Recognition Model (without segmenting individual character). I am using the word dataset from IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing batch size, etc. but the loss keeps on fluctuating with no/significant overall decrease - TensorBoard visualization for cost at each step
I am new to TensorFlow so could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)
After trying a lot of things unsuccessfully, I found that an incorrect argument was provided to the sequence_length argument of tf.nn.ctc_loss. It should be set to 'length of input sequence' but I had set it to 'length of output sequence(labels - number of character)'
More details can be found in comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time
Also, if one has a GPU it would be better to use Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc) as it can speed up the training a lot.
The problem is that you are feeding raw images in the LSTM, so it is very difficult for it to extract any useful information. The CRNN paper first uses a series of convolutional layers to extract features from the images, and then these are fed into the LSTM.

Ordering from tensorflow queues

I am trying to understand queues in better detail. Using the code below I expect that since I am not shuffling the alphabetic list the output collection will be in alphabetic order. This seems to be the case for all but the initial epoch. Am I misunderstanding something?
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import tensorflow as tf
import numpy as np
import string
# Basic model parameters as external flags.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('num_epochs', 2, 'Number of epochs to run trainer.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_integer('batch_size', 100, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_string('train_dir', '/tmp/data',
'Directory to put the training data.')
flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
'for unit testing.')
def run_training():
# Tell TensorFlow that the model will be built into the default Graph.
with tf.Graph().as_default():
with tf.name_scope('input'):
# Input data
images_initializer = tf.placeholder(
dtype=tf.int64,
shape=[52,1])
input_images = tf.Variable(
images_initializer, trainable=False, collections=[])
image = tf.train.slice_input_producer(
[input_images], num_epochs=2)
images = tf.train.batch(
[image], batch_size=1)
alph_initializer = tf.placeholder(
dtype=tf.string,
shape=[26,1])
input_alph = tf.Variable(
alph_initializer, trainable=False, collections=[])
alph = tf.train.slice_input_producer(
[input_alph], shuffle=False, capacity=26)
alphs = tf.train.batch(
[alph], batch_size=1)
my_list = np.array(list(range(0,52))).reshape(52,1)
my_list_val = np.array(list(string.ascii_lowercase)).reshape(26,1)
# Create the op for initializing variables.
init_op = tf.initialize_all_variables()
# Create a session for running Ops on the Graph.
sess = tf.Session()
# Run the Op to initialize the variables.
sess.run(init_op)
sess.run(input_images.initializer,
feed_dict={images_initializer: my_list})
sess.run(input_alph.initializer,
feed_dict={alph_initializer: my_list_val})
sess.run(tf.local_variables_initializer())
sess.run(tf.global_variables_initializer())
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
# And then after everything is built, start the training loop.
collection = []
try:
step = 0
while not coord.should_stop():
start_time = time.time()
# Run one step of the model.
integer = sess.run(image)
#print("Integer val", integer)
char = sess.run(alph)
collection.append(char[0][0])
print("String val", char)
duration = time.time() - start_time
except tf.errors.OutOfRangeError:
print('Saving')
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
# When done, ask the threads to stop.
coord.request_stop()
print(str(collection))
# Wait for threads to finish.
coord.join(threads)
sess.close()
def main(_):
run_training()
if __name__ == '__main__':
tf.app.run()
Changing the above to the below clears up my confusion
try:
step = 0
while not coord.should_stop():
start_time = time.time()
# Run one step of the model.
integer = sess.run(images)
#print("Integer val", integer)
char = sess.run(alphs)
collection.append(char[0][0])
print("String val", char)
duration = time.time() - start_time
except tf.errors.OutOfRangeError:
print('Saving')
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
# When done, ask the threads to stop.
coord.request_stop()
print(str(collection))

No classification done after passing an image to the model in Tensorflow

I am trying to pass an image to the model that i have created by following the 2_fullyconnected.ipynb udacity assignment.
The code in which i have created the model is shown below .
# In[1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
# First reload the data we generated in `1_notmnist.ipynb`.
# In[2]:
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
print(train_dataset[0])
print(train_labels[0])
# Reformat into a shape that's more adapted to the models we're going to train:
# - data as a flat matrix,
# - labels as float 1-hot encodings.
# In[3]:
image_size = 28
num_labels = 10
def reformat(dataset, labels):
print(type(dataset))
#print(dataset[0])
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
#stochastic gradient descent training
# In[7]:
batch_size = 128
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Variables.
weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_labels]),name = "weights")
biases = tf.Variable(tf.zeros([num_labels]),name ="biases")
# Training computation.
logits = tf.matmul(tf_train_dataset, weights) + biases
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(
tf.matmul(tf_valid_dataset, weights) + biases)
test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
# In[9]:
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
# Let's run it:
# In[10]:
num_steps = 3001
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
save_path = tf.train.Saver().save(session, "/tmp/important_model/model.ckpt")
print("Model saved in file: %s" % save_path)
The model is saved in /tmp/important_model/.
Tree structure for that folder is as follows:
important_model/
|-- checkpoint
|-- model.ckpt
`-- model.ckpt.meta
Now i am creating a new file in which i am trying to restore my model and then pass an image to the model for classification .
I have created the graph in the new python file as well , which is necessary for restoring the model (I think, I could be wrong. please correct me if i am wrong).
# In[16]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
from scipy import ndimage
# In[17]:
image_size = 28
num_labels = 10
# In[25]:
# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
#train_subset = 1000
batch_size = 1
graph = tf.Graph()
with graph.as_default():
# Variables.
# These are the parameters that we are going to be training. The weight
# matrix will be initialized using random valued following a (truncated)
# normal distribution. The biases get initialized to zero.
# Variables.
#saver = tf.train.Saver()
weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_labels]),name = "weights")
biases = tf.Variable(tf.zeros([num_labels]),name ="biases")
tf_valid_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
valid_prediction = tf.nn.softmax(
tf.matmul(tf_valid_dataset, weights) + biases)
# In[26]:
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
# In[34]:
pixel_depth = 255
image_data = (ndimage.imread('notMNIST_small/A/QXJyaWJhQXJyaWJhU3RkLm90Zg==.png').astype(float) -
pixel_depth / 2) / pixel_depth
print(image_data.shape)
resized_data = image_data.reshape((-1,784))
print(resized_data.shape)
with tf.Session(graph=graph) as session:
tf.train.Saver().restore(session, "/tmp/important_model/model.ckpt")
print("Model restored.")
session.run(valid_prediction,feed_dict={tf_valid_dataset:resized_data})
When i am executing ln[34] in this ipython notebookthe output that is coming is :
(28, 28)
(1, 784)
Model restored
I want to tell the 5 probable labels which the given image may belong to but don't know how to do it , The above program doesn't shows any error but neither shows the desired output . I thought i will get the probabilities of the image being in all classes as i have passed my image in tf.nn.softmax function but unfortunately not getting anything .
Any help would be appreciated.
The following line in your code computes a probability distribution across the possible output labels for each image in your data set (in this case a single image):
session.run(valid_prediction,feed_dict={tf_valid_dataset:resized_data})
The result of this method is a NumPy array of shape (1, 10). To see the probabilities, you can simply print the array:
result = session.run(valid_prediction,feed_dict={tf_valid_dataset:resized_data})
print(result)
There are many ways that you can get the top k predictions for your image. One of the easiest is to use TensorFlow's tf.nn.top_k() operator when defining your graph:
valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
top_5_labels = tf.nn.top_k(valid_prediction, k=5)
# ...
result = session.run(top_5_labels, feed_dict={tf_valid_dataset: resized_data})
print(result)