TensorFlow: No decrease in CTC loss while training BLSTM - tensorflow

I am trying to create an end-to-end trainable offline English Handwriting Recognition Model (without segmenting individual character). I am using the word dataset from IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing batch size, etc. but the loss keeps on fluctuating with no/significant overall decrease - TensorBoard visualization for cost at each step
I am new to TensorFlow so could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)

After trying a lot of things unsuccessfully, I found that an incorrect argument was provided to the sequence_length argument of tf.nn.ctc_loss. It should be set to 'length of input sequence' but I had set it to 'length of output sequence(labels - number of character)'
More details can be found in comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time
Also, if one has a GPU it would be better to use Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc) as it can speed up the training a lot.

The problem is that you are feeding raw images in the LSTM, so it is very difficult for it to extract any useful information. The CRNN paper first uses a series of convolutional layers to extract features from the images, and then these are fed into the LSTM.

Related

L2 regularization keep increasing during training

I am finetuning InceptionResnetV2 on TensorFlow. When training, the regularization loss keep linearly increasing and even much larger than cross entropy loss in the later stage of training. I have checked the training procedure, and make sure I am optimizing the cross entropy loss and L2 loss combined.
Is there anyone explain this weird thing a little bit? Any feedback is appreciated.
Here is the code and some TensorBoard plots.
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
from preprocessing import aug_parallel_v2
import numpy as np
slim = tf.contrib.slim
# total training data number
sample_num = 625020
data_path = 'iNaturalist_train.tfrecords'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log_v5'
# tensorboard visualization path
filewriter_path = './filewriter_v5_Logits'
# State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
checkpoint_save_addr = './log_v5/fine-tuning_v5.ckpt'
# State the image size you're resizing your images to. We will use the default inception size of 299.
image_size = 299
# State the number of classes to predict:
num_classes = 8142
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 5
# State your batch size
batch_size = 60
# Learning rate information and configuration
initial_learning_rate = 0.0005
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 2
# put weight on different classes inversely proportional
# to total number of their image samples
label_count = np.loadtxt('label_count.txt', dtype=int)
inverse = lambda t: 1 / t
vfunc = np.vectorize(inverse)
multiplier = vfunc(label_count)
multiplier /= np.mean(multiplier)
def run():
if not os.path.exists(log_dir):
os.mkdir(log_dir)
feature = {'train/height': tf.FixedLenFeature([], tf.int64),
'train/width': tf.FixedLenFeature([], tf.int64),
'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64),
'train/sup_label': tf.FixedLenFeature([], tf.int64),
'train/aug_level': tf.FixedLenFeature([], tf.int64)}
# create a list of file names
filename_queue = tf.train.string_input_producer([data_path], num_epochs=None)
print(filename_queue)
reader = tf.TFRecordReader()
_, tfrecord_serialized = reader.read(filename_queue)
features = tf.parse_single_example(tfrecord_serialized, features=feature)
# Convert the image data from string back to the numbers
height = tf.cast(features['train/height'], tf.int64)
width = tf.cast(features['train/width'], tf.int64)
# change this line for your TFrecord version
tf_image = tf.image.decode_jpeg(features['train/image'])
tf_label = tf.cast(features['train/label'], tf.int32)
aug_level = tf.cast(features['train/aug_level'], tf.int32)
# tf_sup_label = tf.cast(features['train/sup_label'], tf.int64)
tf_image = tf.reshape(tf_image, tf.stack([height, width, 3]))
tf_label = tf.reshape(tf_label, [1])
aug_level = tf.reshape(aug_level, [1])
resized_image = tf.image.resize_images(images=tf_image, size=tf.constant([400, 400]), method=2)
resized_image = tf.cast(resized_image, tf.uint8)
tf_images, tf_labels, tf_aug = tf.train.shuffle_batch([resized_image, tf_label, aug_level], batch_size=batch_size,
capacity=2048, num_threads=16, allow_smaller_final_batch=False,
min_after_dequeue=256)
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
IMAGE_HEIGHT = 299
IMAGE_WIDTH = 299
images = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, 1])
weighted_level = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(sample_num / batch_size)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(images, num_classes=num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
print("label test")
print(labels)
print(logits)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = tf.squeeze(tf.one_hot(labels, num_classes), [1])
print(one_hot_labels)
print(logits)
weighted_onehot = tf.multiply(one_hot_labels, weighted_level)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
digits_loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_onehot, logits=logits)
reg_loss = tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# train_vars = []
# Now we can define the optimizer that takes on the learning rate
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
"InceptionResnetV2/Logits")
# RMSProp or Adam
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer, variables_to_train=train_vars)
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.metrics.accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
tf.summary.scalar('losses/Reg_Loss', reg_loss)
tf.summary.scalar('losses/Digit_Loss', digits_loss)
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
writer = tf.summary.FileWriter(filewriter_path)
writer.add_graph(tf.get_default_graph())
my_summary_op = tf.summary.merge_all()
def train_step(sess, train_op, global_step, imgs, lbls, weight):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed
for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: digit_loss: %.4f (%.2f sec/step)',
global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
saver_pretrain = tf.train.Saver(variables_to_restore)
saver_train = tf.train.Saver(train_vars)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
# Create a coordinator and run all QueueRunner objects
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
saver_pretrain.restore(sess, checkpoint_file)
start_time = time.time()
for step in range(int(num_steps_per_epoch * num_epochs)):
imgs, lbls, augs = sess.run([tf_images, tf_labels, tf_aug])
imgs, lbls = aug_parallel_v2(imgs, lbls, augs)
imgs = imgs[:, 50:349, 50:349, :]
imgs = 2*(imgs.astype(np.float32)) - 1
lbls = lbls.astype(np.int32)
weight = multiplier[lbls]
weight = np.array(weight).reshape((batch_size, 1))
# print(imgs[0, 0:10, 0:10, 0:2])
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 10 step.
if step % 20 == 0:
loss, global_step_count = train_step(sess, train_op, global_step, imgs, lbls, weight)
summaries = sess.run(my_summary_op, feed_dict={images: imgs, labels: lbls, weighted_level: weight})
writer.add_summary(summaries, global_step_count)
# sess.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, global_step, imgs, lbls, weight)
if step % 2000 == 0:
logging.info('Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
print('one batch time: ', time.time() - start_time)
start_time = time.time()
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
# Stop the threads
coord.request_stop()
# Wait for threads to stop
coord.join(threads)
sess.close()
if __name__ == '__main__':
run()
I am new here, and don't have enough reputation to post images.
Here are two links for the accuracy plot and losses plot. You can easily tell the regularization loss is in a dominant position.
This is a difficult question to answer. I can give some pointers though.
In general, when you try to minimize digits_loss, that is to fit your model to your data, you will slowly change the weights in your layers. To counter potential overfitting, a L2 regularization loss (the sum of the squares of all weights, reg_loss in your code) is generally added to the overall loss (total_loss in your code.) These two forces generally act against each other and if the balance is right, you train a good model.
In your case you're taking a network (resnet_v2) that was developed for 1,001 classes and try to predict 8,142 classes. No problem with that per se, but you're upsetting the balance. So I believe you need to override the default weight decay of 0.00004 for resnet v2 to some higher value, in this line (note only 3 zeros in the decimals for a 10x increase):
with slim.arg_scope( inception_resnet_v2_arg_scope( weight_decay = 0.0004 ) ):
A higher weight_decay parameter will force the L2 loss to decrease faster. The problem is that this number is just a guess, I have no idea what an ideal value would be. You need to experiment with multiple values and figure it out.

Issue exporting trained Tensorflow model parameters to SavedModel format

I have built a system that leverages Google ML Engine to train various text classifiers using a simple flat CNN architecture (borrowed from the excellent WildML post). I've also leveraged heavily the ML Engine trainer template which exists here - specifically using the Tensorflow core functions.
My issue is that while the model trains and learns parameters correctly, I cannot get the serialized export in the binary SavedModel format (i.e. - the .pb files) to maintain the learned weights. I can tell this by using the gcloud predict local API on the model export folder and each time it makes randomized predictions - leading me to believe that while the graph structure is being saved to the proto-buf format, the associated weights in the checkpoint file are not being carried over.
Here's the code for my run function:
def run(...):
# ... code to load and transform train/test data
with train_graph.as_default():
with tf.Session(graph=train_graph).as_default() as session:
# Features and label tensors as read using filename queue
features, labels = model.input_fn(
x_train,
y_train,
num_epochs=num_epochs,
batch_size=train_batch_size
)
# Returns the training graph and global step tensor
tf.logging.info("Train vocab size: {:d}".format(vocab_size))
train_op, global_step_tensor, cnn, train_summaries = model.model_fn(
model.TRAIN,
sequence_length,
num_classes,
label_values,
vocab_size,
embedding_size,
filter_sizes,
num_filters
)
tf.logging.info("Created simple training CNN with ({}) filter types".format(filter_sizes))
# Setup writers
train_summary_op = tf.summary.merge(train_summaries)
train_summary_dir = os.path.join(job_dir, "summaries", "train")
# Generate writer
train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
# Initialize all variables
session.run(tf.global_variables_initializer())
session.run(tf.local_variables_initializer())
model_dir = os.path.abspath(os.path.join(job_dir, "model"))
if not os.path.exists(model_dir):
os.makedirs(model_dir)
saver = tf.train.Saver()
def train_step(x_batch, y_batch):
"""
A single training step
"""
feed_dict = {
cnn.input_x: x_batch,
cnn.input_y: y_batch,
cnn.dropout_keep_prob: 0.5
}
step, _, loss, accuracy = session.run([global_step_tensor, train_op, cnn.loss, cnn.accuracy],
feed_dict=feed_dict)
time_str = datetime.datetime.now().isoformat()
if step % 10 == 0:
tf.logging.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
# Return current step
return step
def eval_step(x_batch, y_batch, train_step, total_steps):
"""
Evaluates model on a dev set
"""
feed_dict = {
cnn.input_x: x_batch,
cnn.input_y: y_batch,
cnn.dropout_keep_prob: 1.0
}
step, loss, accuracy, scores, predictions = session.run([global_step_tensor, cnn.loss, cnn.accuracy, cnn.scores, cnn.predictions],
feed_dict=feed_dict)
# Get metrics
y_actual = np.argmax(y_batch, 1)
model_metrics = precision_recall_fscore_support(y_actual, predictions)
#print(scores)
time_str = datetime.datetime.now().isoformat()
print("\n---- EVAULATION ----")
avg_precision = np.mean(model_metrics[0], axis=0)
avg_recall = np.mean(model_metrics[1], axis=0)
avg_f1 = np.mean(model_metrics[2], axis=0)
print("{}: step {}, loss {:g}, acc {:g}, prec {:g}, rec {:g}, f1 {:g}".format(time_str, step, loss, accuracy, avg_precision, avg_recall, avg_f1))
print("Model metrics: ", model_metrics)
print("---- EVALUATION ----\n")
# Generate batches
batches = data_helpers.batch_iter(
list(zip(features, labels)), train_batch_size, num_epochs)
# Training loop. For each batch...
for batch in batches:
x_batch, y_batch = zip(*batch)
current_step = train_step(x_batch, y_batch)
if current_step % 20 == 0 or current_step == 1:
eval_step(x_eval, y_eval, current_step, total_steps)
# Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
print(model_dir)
trained_model = saver.save(session, os.path.join(job_dir, 'model') + "/model.ckpt", global_step=current_step)
print(trained_model)
print("Saved final model checkpoint to {}".format(trained_model))
# Only perform this if chief
if is_chief:
build_and_run_exports(trained_model, job_dir,
model.SERVING_INPUT_FUNCTIONS[model.TEXT],
sequence_length, num_classes, label_values,
vocab_size, embedding_size, filter_sizes,
num_filters, vocab_processor)
And my build_and_run_exports function:
def build_and_run_exports(...):
# Check if we export already exists - if so delete
export_dir = os.path.join(job_dir, 'export')
if os.path.exists(export_dir):
print("Export currently exists - going to delete:", export_dir)
shutil.rmtree(export_dir)
# Create exporter
exporter = tf.saved_model.builder.SavedModelBuilder(export_dir)
# Restore prediction graph
prediction_graph = tf.Graph()
with prediction_graph.as_default():
with tf.Session(graph=prediction_graph) as session:
# Get training data
features, inputs_dict = serving_input_fn()
# Setup inputs
inputs_info = {
name: tf.saved_model.utils.build_tensor_info(tensor)
for name, tensor in inputs_dict.iteritems()
}
# Load model
cnn = TextCNN(
sequence_length=sequence_length,
num_classes=num_classes,
vocab_size=vocab_size,
embedding_size=embedding_size,
filter_sizes=list(map(int, filter_sizes.split(","))),
num_filters=num_filters,
input_tensor=features)
# Restore model
saver = tf.train.Saver()
saver.restore(session, latest_checkpoint)
# Setup outputs
outputs = {
'logits': cnn.scores,
'probabilities': cnn.probabilities,
'predicted_indices': cnn.predictions
}
# Create output info
output_info = {
name: tf.saved_model.utils.build_tensor_info(tensor)
for name, tensor in outputs.iteritems()
}
# Setup signature definition
signature_def = tf.saved_model.signature_def_utils.build_signature_def(
inputs=inputs_info,
outputs=output_info,
method_name=sig_constants.PREDICT_METHOD_NAME
)
# Create graph export
exporter.add_meta_graph_and_variables(
session,
tags=[tf.saved_model.tag_constants.SERVING],
signature_def_map={
sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
},
legacy_init_op=tf.saved_model.main_op.main_op()
)
# Export model
exporter.save()
And last, but not least, the TextCNN model:
class TextCNN(object):
"""
A CNN for text classification.
Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
"""
def __init__(
self, sequence_length, num_classes, vocab_size,
embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0,
dropout_keep_prob=0.5, input_tensor=None):
# Setup input
if input_tensor != None:
self.input_x = input_tensor
self.dropout_keep_prob = tf.constant(1.0)
else:
self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
# Placeholders for input, output and dropout
self.input_y = tf.placeholder(tf.int32, [None, num_classes], name="input_y")
# Keeping track of l2 regularization loss (optional)
l2_loss = tf.constant(0.0)
# Embedding layer
with tf.device('/cpu:0'), tf.name_scope("embedding"):
self.W = tf.Variable(
tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
name="W")
self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
# Create a convolution + maxpool layer for each filter size
pooled_outputs = []
for i, filter_size in enumerate(filter_sizes):
with tf.name_scope("conv-maxpool-%s" % filter_size):
# Convolution Layer
filter_shape = [filter_size, embedding_size, 1, num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
conv = tf.nn.conv2d(
self.embedded_chars_expanded,
W,
strides=[1, 1, 1, 1],
padding="VALID",
name="conv")
# Apply nonlinearity
h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
# Maxpooling over the outputs
pooled = tf.nn.max_pool(
h,
ksize=[1, sequence_length - filter_size + 1, 1, 1],
strides=[1, 1, 1, 1],
padding='VALID',
name="pool")
pooled_outputs.append(pooled)
# Combine all the pooled features
num_filters_total = num_filters * len(filter_sizes)
self.h_pool = tf.concat(pooled_outputs, 3)
self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
# Add dropout
with tf.name_scope("dropout"):
self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
# Final (unnormalized) scores and predictions
with tf.name_scope("output"):
W = tf.get_variable(
"W",
shape=[num_filters_total, num_classes],
initializer=tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
l2_loss += tf.nn.l2_loss(W)
l2_loss += tf.nn.l2_loss(b)
self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
self.predictions = tf.argmax(self.scores, 1, name="predictions")
# CalculateMean cross-entropy loss
with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
with tf.name_scope("probabilities"):
self.probabilities = tf.nn.softmax(logits=self.scores)
# Accuracy
with tf.name_scope("accuracy"):
correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
I'm hoping I'm just missing something simple in how I'm creating the TF graph / session and restoring stats.
Thank you in advance for your help!
This behavior is caused due to the behavior of tf.saved_model.main_op.main_op() which randomly initializes all of the variables in the graph (code). However, legacy_init_op happens after the variables are restored from the checkpoint (restore happens here followed by legacy_init_op here).
The solution is simply to not re-initialize all of the variables, for example, in your code:
from tensorflow.python.ops import variables
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import control_flow_ops
def my_main_op():
init_local = variables.local_variables_initializer()
init_tables = lookup_ops.tables_initializer()
return control_flow_ops.group(init_local, init_tables)
def build_and_run_exports(...):
...
# Create graph export
exporter.add_meta_graph_and_variables(
session,
tags=[tf.saved_model.tag_constants.SERVING],
signature_def_map={
sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
},
legacy_init_op=my_main_op()
)
# Export model
exporter.save()

Tensorflow neural network has high error even in really easy dataset

I'm trying to implement a 1 hidden layer NN for a regression problem. The loss function improves for a few iterations than it gets stuck on a really high error even for a very easy data. Could someone help me find the bug? Here is my code:
import tensorflow as tf
import scipy.io as sio
import numpy as np
reuse_weights = 1
n_nodes_hl1 = 10
batch_size = 200
hm_epochs = 20
# load input from matlab
input_training = sio.loadmat('xMat.mat')
input_training = input_training['xMat']
input_test = sio.loadmat('xMat.mat')
input_test = input_test['xMat']
# find number of measurements and input length
n_measurements = input_training.shape[0]
input_length = input_training.shape[1]
# current input
data_y = input_training[:, input_length - 1].astype(float)
data_x = input_training[:, 0 : input_length - 1].astype(float)
test_data_y = input_test[:, input_length - 1].astype(float)
test_data_x = input_test[:, 0 : input_length - 1].astype(float)
x = tf.placeholder('float32',[None, input_length - 1])
y = tf.placeholder('float32')
# place holder for Dropout algorithm drop probability
keep_prob = tf.placeholder('float32')
def next_batch(data):
"""
Return a total of `batch_size` samples from the array `data`.
"""
if len(data.shape) == 2:
idx = np.arange(0, len(data[:,0])) # get all possible indexes
else:
idx = np.arange(0, len(data)) # get all possible indexes
np.random.shuffle(idx) # shuffle indexes
idx = idx[0:batch_size] # use only `batch_size` random indexes
if len(data.shape) == 2:
data_shuffle = [data[i,:] for i in idx] # get list of `batch_size` random samples
else:
data_shuffle = [data[i] for i in idx] # get list of `batch_size` random samples
data_shuffle = np.asarray(data_shuffle) # get back numpy array
return data_shuffle
def neural_network_model(data, weights, biases, keep_prob):
layer1 = tf.add(tf.matmul(data, weights['h1']), biases['b1'])
layer1 = tf.nn.sigmoid(layer1)
output = tf.add(tf.matmul(layer1, weights['out']), biases['out'])
return output
if reuse_weights:
weights = {
'h1': tf.Variable(sio.loadmat('weights_h1.mat')['weights_h1'], name="weights_h1"),
'out': tf.Variable(sio.loadmat('weights_out.mat')['weights_out'], name="weights_out")
}
biases = {
'b1': tf.Variable(sio.loadmat('biases_b1.mat')['biases_b1'], name="biases_b1"),
'out': tf.Variable(sio.loadmat('biases_out.mat')['biases_out'], name="biases_out")
}
else: # initialize weights
weights = {
'h1': tf.Variable(tf.random_normal([input_length - 1, n_nodes_hl1]), name="weights_h1"),
'out': tf.Variable(tf.random_normal([n_nodes_hl1, 1]), name="weights_out")
}
biases = {
'b1': tf.Variable(tf.random_normal([n_nodes_hl1]), name="biases_b1"),
'out': tf.Variable(tf.random_normal([1]), name="biases_out")
}
def train_neural_network(x):
prediction = neural_network_model(x, weights, biases, keep_prob)[:,0]
cost = tf.reduce_mean(tf.abs(prediction - y))
optimizer = tf.train.AdamOptimizer()
opt = optimizer.minimize(cost)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print(weights['h1'])
for epoch in range(hm_epochs): #training
epoch_loss = 0
for _ in range(int(n_measurements/batch_size)):
_, c, p = sess.run([opt, cost, prediction], feed_dict = {x:next_batch(data_x),\
y:next_batch(data_y) , keep_prob : 1.0})
epoch_loss += c
print('Epoch', epoch, 'completed out of', hm_epochs, 'Average loss:', epoch_loss/int(n_measurements/batch_size))
# prediction
accuracy = tf.reduce_mean(tf.abs(prediction - y))
# Feed 1.0 for keep prob during testing
print("Training data accuracy:", accuracy.eval({x: data_x, y: data_y, keep_prob : 1.0}))
print("Training data predictions:", prediction.eval({x: data_x[0:5,:], keep_prob : 1.0}))
print("Training data:",data_y[0:5])
#print("Test data accuracy:", accuracy.eval({x: test_data_x, y: test_data_y, keep_prob : 1.0}))
# save numpy arrays
sio.savemat('weights_h1.mat', {'weights_h1': weights['h1'].eval()})
sio.savemat('biases_b1.mat', {'biases_b1': biases['b1'].eval()})
sio.savemat('weights_out.mat', {'weights_out': weights['out'].eval()})
sio.savemat('biases_out.mat', {'biases_out': biases['out'].eval()})
train_neural_network(x)
Figured it out, the problem was with the data shuffling. The input and response were shuffled differently (two times random shuffle for each epoch) and thus the input data in each epoch did not correspond to the response data.

Tensorflow does not train CIFAR - 100 data

I am trying to build a linear classifier with CIFAR - 100 using TensorFlow. I got the code from Martin Gorner's MNIST tutorial and change a bit. When I run this code, tensorflow does not training (code is running but accuracy remains 1.0 and loss(cross entropy remains as 4605.17), I don't know what is wrong, I am actually newbie to TF any help is appreciated.
import pickle
import numpy as np
import os
import tensorflow as tf
from tensorflow.python.framework import tensor_util
import math
#imports data
def unpickle(file):
import pickle
with open(file, 'rb') as fo:
dict = pickle.load(fo, encoding='bytes')
return dict
cifar100_test = {}
cifar100_train = {}
labelMap = {}
labelNames = {}
# Load the raw CIFAR-10 data.
cifar100_test = unpickle('dataset/cifar-100-python/test')
cifar100_train = unpickle('dataset/cifar-100-python/train')
labelMap = unpickle('dataset/cifar-100-python/meta')
#tr for training data and te for testing data, X is data, Y is label
Xtr = cifar100_train[b'data']
Yr = cifar100_train[b'fine_labels']
Xte = cifar100_test[b'data']
Ye = cifar100_test[b'fine_labels']
classNames = labelMap[b'fine_label_names']
num_train = Xtr.shape[0]
num_test = Xte.shape[0]
num_class = len(classNames)
Ytr = np.zeros([num_train, num_class])
Yte = np.zeros([num_test, num_class])
Ytr[0:num_train, Yr[0:num_train]] = 1
Yte[0:num_test, Ye[0:num_test]] = 1
# As a sanity check, we print out the size of the training and test data.
print('Train data shape:', Xtr.shape)
print('Train Label shape:', Ytr.shape)
print('Test data shape:', Xte.shape)
print('Test Label shape:', Yte.shape)
print('Name of Predicted Class:', classNames[0]) #indice of the label name is the indice of the class.
Xtrain = Xtr#[:1000]
Xtest = Xte#[:100]
Ytrain = Ytr#[:1000]
Ytest = Yte#[:100]
print('Train data shape:', Xtrain.shape)
print('Train Label shape:', Ytrain.shape)
print('Test data shape:', Xtest.shape)
print('Test Label shape:', Ytest.shape)
Xtrain = np.reshape(Xtrain,(50000, 32, 32, 3)).transpose(0,1,2,3).astype(float)
Xtest = np.reshape(Xtest,(10000, 32, 32, 3)).transpose(0,1,2,3).astype(float)
Xbatches = np.split(Xtrain, 500); #second number is # of batches
Ybatches = np.split(np.asarray(Ytrain), 500);
XtestB = np.split(Xtest, 100);
YtestB = np.split(Ytest, 100);
print('X # of batches:', len(Xbatches))
print('Y # of batches:', len(Ybatches))
# input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
X = tf.placeholder(tf.float32, [100, 32, 32, 3])
# correct answers will go here
Y_ = tf.placeholder(tf.float32, [100, 100])
# weights W[784, 10] 784=28*28
W = tf.Variable(tf.zeros([3072, 100]))
# biases b[10]
b = tf.Variable(tf.zeros([100]))
# flatten the images into a single line of pixels
# -1 in the shape definition means "the only possible dimension that will preserve the number of elements"
XX = tf.reshape(X, [-1, 3072])
# The model
Y = tf.nn.softmax(tf.matmul(XX, W) + b)
# loss function: cross-entropy = - sum( Y_i * log(Yi) )
# Y: the computed output vector
# Y_: the desired output vector
# cross-entropy
# log takes the log of each element, * multiplies the tensors element by element
# reduce_mean will add all the components in the tensor
# so here we end up with the total cross-entropy for all images in the batch
cross_entropy = -tf.reduce_mean(Y_ * tf.log(Y)) * 1000.0 # normalized for batches of 100 images,
# *10 because "mean" included an unwanted division by 10
# accuracy of the trained model, between 0 (worst) and 1 (best)
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# training, learning rate = 0.005
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
# init
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(500):
# the backpropagation training step
t, Loss = sess.run([train_step, cross_entropy], feed_dict={X: Xbatches[i], Y_: Ybatches[i]})
print(Loss)
print(i)
for i in range(100):
print('accuracy:', sess.run(accuracy, feed_dict={X: XtestB[i], Y_: YtestB[i]}))
You compute the accuracy a hundred times after the training process is completed. Nothing will change there. You should place your print('accuracy:'....) within the for loop in which you perform the backpropagation:
for i in range(500):
# the backpropagation training step
t, Loss = sess.run([train_step, cross_entropy], feed_dict={X: Xbatches[i], Y_: Ybatches[i]})
print(Loss)
print(i)
print('accuracy:', sess.run(accuracy, feed_dict={X: XtestB[i], Y_: YtestB[i]}))
Sorry for the post it turns out that it is a basic mistake.
I changed following;
Ytr[0:num_train, Yr[0:num_train]] = 1
Yte[0:num_test, Ye[0:num_test]] = 1
with
Ytr[range(num_train), Yr_temp[range(num_train)]] = 1
Yte[range(num_test), Ye_temp[range(num_test)]] = 1
First one make all values 1, but I just wanted to make indice of the true class 1 and other elements 0. Thanks for your time.

My tensorboard events appear many charts I did not summary

I only summary my loss as 'xentropy_mean' in training() ,but in tensorboard ,I had not find the 'xentropy_mean' chart but many other charts I did not defined. I don't know where I wrote wrong, and what's the matter indeed. Is it because I use thread in my code? If I don't use thread, how should I wrote it?
The tensorboard screenshot
There are 6 charts under the queue,I don't know what are the meanings either
I create the model in the file below
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow.python.platform
import tensorflow as tf
# The MNIST dataset has 10 classes, representing the digits 0 through 9.
NUM_CLASSES = 16
# The MNIST images are always 28x28 pixels.
IMAGE_SIZE = 28
IMAGE_PIXELS = 784
def inference(images, hidden1_units, hidden2_units):
"""Build the MNIST model up to where it may be used for inference.
Args:
images: Images placeholder, from inputs().
hidden1_units: Size of the first hidden layer.
hidden2_units: Size of the second hidden layer.
Returns:
softmax_linear: Output tensor with the computed logits.
"""
# Hidden 1
with tf.name_scope('hidden1'):
weights = tf.Variable(
tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]),
name='biases')
hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
# Hidden 2
with tf.name_scope('hidden2'):
weights = tf.Variable(
tf.truncated_normal([hidden1_units, hidden2_units],
stddev=1.0 / math.sqrt(float(hidden1_units))),
name='weights')
biases = tf.Variable(tf.zeros([hidden2_units]),
name='biases')
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
# Linear
with tf.name_scope('softmax_linear'):
weights = tf.Variable(
tf.truncated_normal([hidden2_units, NUM_CLASSES],
stddev=1.0 / math.sqrt(float(hidden2_units))),
name='weights')
biases = tf.Variable(tf.zeros([NUM_CLASSES]),
name='biases')
logits = tf.matmul(hidden2, weights) + biases
return logits
def loss(logits, labels):
batch_size = tf.size(labels)
#print('batch size %d' %(batch_size))
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size), 1)
concated = tf.concat(1, [indices, labels])
#print('Done2')
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([batch_size, 16]), 1.0, 0.0)
#print('Done1')
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,
onehot_labels,
name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
tf.summary.scalar(loss.op.name, loss)
return loss
def training(loss, learning_rate):
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
global_step=tf.Variable(0,name='global_step',trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
def evaluation(logits, labels):
correct = tf.nn.in_top_k(logits, labels, 1)
# Return the number of true entries.
return tf.reduce_sum(tf.cast(correct, tf.int32))
and train the model in this file:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import time
import numpy as np
import tensorflow as tf
import mnist
# Basic model parameters as external flags.
#FLAGS = None
# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
TEST_FILE='test.tfrecords'
flags = tf.app.flags
FLAGS = flags.FLAGS
#FLAGS = None
flags.DEFINE_string('train_dir', '/home/queenie/image2tfrecord/tfrecords-28-gray/', 'Directory to put the training data.')
flags.DEFINE_string('filename', 'train.tfrecords', 'Directory to put the training data.')
flags.DEFINE_integer('batch_size', 100, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('num_epochs', None, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('hidden1', 128,'balabala')
flags.DEFINE_integer('hidden2', 32,'balabala')
flags.DEFINE_integer('learning_rate', 0.01,'balabala')
flags.DEFINE_integer('max_steps', 50000,'balabala')
def placeholder_inputs(batch_size):
images_placeholder=tf.placeholder(tf.float32,shape=(batch_size,mnist.IMAGE_PIXELS))
labels_placeholder=tf.placeholder(tf.int32,shape=(batch_size))
return images_placeholder,labels_placeholder
def fill_feed_dict(images_feed,labels_feed,images_pl,labels_pl):
feed_dict={
images_pl:images_feed,
labels_pl:labels_feed,
}
return feed_dict
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'image_raw': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.int64),
})
# Convert from a scalar string tensor (whose single string has
# length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
# [mnist.IMAGE_PIXELS].
image = tf.decode_raw(features['image_raw'], tf.uint8)
image.set_shape([mnist.IMAGE_PIXELS])
# OPTIONAL: Could reshape into a 28x28 image and apply distortions
# here. Since we are not applying any distortions in this
# example, and the next step expects the image to be flattened
# into a vector, we don't bother.
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
# Convert label from a scalar uint8 tensor to an int32 scalar.
label = tf.cast(features['label'], tf.int32)
return image, label
def do_eval(sess,eval_correct):
true_count=0
for step in xrange(FLAGS.batch_size):
#print(sess.run(eval_correct))
true_count+=sess.run(eval_correct)
precision=float(true_count)/FLAGS.batch_size/FLAGS.batch_size
print(' Num examples: %d Num correct: %d Precision # 1: %0.04f' %
(FLAGS.batch_size, true_count, precision))
return precision
def inputs(train, batch_size, num_epochs):
if not num_epochs: num_epochs = None
if train=='train':
filename=os.path.join(FLAGS.train_dir,TRAIN_FILE)
elif train=='validation':
filename=os.path.join(FLAGS.train_dir,VALIDATION_FILE)
else:
filename=os.path.join(FLAGS.train_dir,TEST_FILE)
# filename = os.path.join(FLAGS.train_dir,
# TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=None)
# Even when reading in multiple threads, share the filename
# queue.
image, label = read_and_decode(filename_queue)
# Shuffle the examples and collect them into batch_size batches.
# (Internally uses a RandomShuffleQueue.)
# We run this in two threads to avoid being a bottleneck.
images, sparse_labels = tf.train.shuffle_batch(
[image, label], batch_size=batch_size, num_threads=2,
capacity=1000 + 3 * batch_size,
# Ensures a minimum amount of shuffling of examples.
min_after_dequeue=1000)
return images, sparse_labels
def run_training():
with tf.Graph().as_default():
# Build a Graph that computes predictions from the inference model.
images, labels = inputs(train='train', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_valid,labels_valid=inputs(train='validation', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_test,labels_test=inputs(train='test', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
logits = mnist.inference(images,
FLAGS.hidden1,
FLAGS.hidden2)
# Add to the Graph the loss calculation.
valid_prediction=mnist.inference(images_valid,FLAGS.hidden1,FLAGS.hidden2)
test_prediction=mnist.inference(images_test,FLAGS.hidden1,FLAGS.hidden2)
loss = mnist.loss(logits, labels)
# Add to the Graph operations that train the model.
train_op = mnist.training(loss, FLAGS.learning_rate)
eval_correct=mnist.evaluation(logits,labels)
eval_correct_valid=mnist.evaluation(valid_prediction,labels_valid)
eval_correct_test=mnist.evaluation(test_prediction,labels_test)
summary_op=tf.merge_all_summaries()
# The op for initializing the variables.
init_op = tf.group(tf.initialize_all_variables(),
tf.initialize_local_variables())
saver = tf.train.Saver()
# Create a session for running operations in the Graph.
sess = tf.Session()
# Initialize the variables (the trained variables and the
# epoch counter).
sess.run(init_op)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
train_precision=0
validation_precision=0
test_precision=0
#while not coord.should_stop():
while not coord.should_stop():
start_time = time.time()
_, loss_value,images_see,labels_see = sess.run([train_op, loss,images,labels])
#print('run done')
duration = time.time() - start_time
# Print an overview fairly often.
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
precision_tr=do_eval(sess,eval_correct)
summary_str=sess.run(summary_op)
summary_writer.add_summary(summary_str,step)
if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_file, global_step=step)
print('Train:')
do_eval(sess,eval_correct)
print('Validation:')
do_eval(sess,eval_correct_valid)
print('Test:')
do_eval(sess,eval_correct_test)
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
# When done, ask the threads to stop.
coord.request_stop()
# Wait for threads to finish.
coord.join(threads)
sess.close()
run_training()
then I get the tensorboard like these,6 charts about queue.
The tensorboard screenshot
The queue charts you are seeing are created by default from shuffle_batch and friends, and can be used to monitor the performance of your input pipeline (you'll ideally want all the queues to stay at capacity, as that means your GPU isn't blocking on input reading).
I don't understand why your summary isn't showing in tensorboard. Can I get more information?