I am fine-tuning InceptionResnetV2 on TensorFlow. During training, the regularization loss keeps increasing linearly and ends up much larger than the cross-entropy loss in the later stages of training. I have checked the training procedure and made sure I am optimizing the cross-entropy loss and the L2 loss combined.
Can anyone explain this odd behaviour? Any feedback is appreciated.
Here is the code and some TensorBoard plots.
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
from preprocessing import aug_parallel_v2
import numpy as np
slim = tf.contrib.slim
# total training data number
sample_num = 625020
data_path = 'iNaturalist_train.tfrecords'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log_v5'
# tensorboard visualization path
filewriter_path = './filewriter_v5_Logits'
# State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
checkpoint_save_addr = './log_v5/fine-tuning_v5.ckpt'
# State the image size you're resizing your images to. We will use the default inception size of 299.
image_size = 299
# State the number of classes to predict:
num_classes = 8142
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 5
# State your batch size
batch_size = 60
# Learning rate information and configuration
initial_learning_rate = 0.0005
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 2
# put weight on different classes inversely proportional
# to total number of their image samples
label_count = np.loadtxt('label_count.txt', dtype=int)
inverse = lambda t: 1 / t
vfunc = np.vectorize(inverse)
multiplier = vfunc(label_count)
multiplier /= np.mean(multiplier)
def run():
if not os.path.exists(log_dir):
os.mkdir(log_dir)
feature = {'train/height': tf.FixedLenFeature([], tf.int64),
'train/width': tf.FixedLenFeature([], tf.int64),
'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64),
'train/sup_label': tf.FixedLenFeature([], tf.int64),
'train/aug_level': tf.FixedLenFeature([], tf.int64)}
# create a list of file names
filename_queue = tf.train.string_input_producer([data_path], num_epochs=None)
print(filename_queue)
reader = tf.TFRecordReader()
_, tfrecord_serialized = reader.read(filename_queue)
features = tf.parse_single_example(tfrecord_serialized, features=feature)
# Convert the image data from string back to the numbers
height = tf.cast(features['train/height'], tf.int64)
width = tf.cast(features['train/width'], tf.int64)
# change this line for your TFrecord version
tf_image = tf.image.decode_jpeg(features['train/image'])
tf_label = tf.cast(features['train/label'], tf.int32)
aug_level = tf.cast(features['train/aug_level'], tf.int32)
# tf_sup_label = tf.cast(features['train/sup_label'], tf.int64)
tf_image = tf.reshape(tf_image, tf.stack([height, width, 3]))
tf_label = tf.reshape(tf_label, [1])
aug_level = tf.reshape(aug_level, [1])
resized_image = tf.image.resize_images(images=tf_image, size=tf.constant([400, 400]), method=2)
resized_image = tf.cast(resized_image, tf.uint8)
tf_images, tf_labels, tf_aug = tf.train.shuffle_batch([resized_image, tf_label, aug_level], batch_size=batch_size,
capacity=2048, num_threads=16, allow_smaller_final_batch=False,
min_after_dequeue=256)
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
IMAGE_HEIGHT = 299
IMAGE_WIDTH = 299
images = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, 1])
weighted_level = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(sample_num / batch_size)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(images, num_classes=num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
print("label test")
print(labels)
print(logits)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = tf.squeeze(tf.one_hot(labels, num_classes), [1])
print(one_hot_labels)
print(logits)
weighted_onehot = tf.multiply(one_hot_labels, weighted_level)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
digits_loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_onehot, logits=logits)
reg_loss = tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss
# Create the global step for monitoring the learning rate and training
global_step = tf.train.get_or_create_global_step()
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# train_vars = []
# Now we can define the optimizer that takes on the learning rate
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
"InceptionResnetV2/Logits")
# RMSProp or Adam
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer, variables_to_train=train_vars)
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.metrics.accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
tf.summary.scalar('losses/Reg_Loss', reg_loss)
tf.summary.scalar('losses/Digit_Loss', digits_loss)
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
writer = tf.summary.FileWriter(filewriter_path)
writer.add_graph(tf.get_default_graph())
my_summary_op = tf.summary.merge_all()
def train_step(sess, train_op, global_step, imgs, lbls, weight):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed
for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: digit_loss: %.4f (%.2f sec/step)',
global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
saver_pretrain = tf.train.Saver(variables_to_restore)
saver_train = tf.train.Saver(train_vars)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
# Create a coordinator and run all QueueRunner objects
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
saver_pretrain.restore(sess, checkpoint_file)
start_time = time.time()
for step in range(int(num_steps_per_epoch * num_epochs)):
imgs, lbls, augs = sess.run([tf_images, tf_labels, tf_aug])
imgs, lbls = aug_parallel_v2(imgs, lbls, augs)
imgs = imgs[:, 50:349, 50:349, :]
imgs = 2*(imgs.astype(np.float32)) - 1
lbls = lbls.astype(np.int32)
weight = multiplier[lbls]
weight = np.array(weight).reshape((batch_size, 1))
# print(imgs[0, 0:10, 0:10, 0:2])
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 20 steps.
if step % 20 == 0:
loss, global_step_count = train_step(sess, train_op, global_step, imgs, lbls, weight)
summaries = sess.run(my_summary_op, feed_dict={images: imgs, labels: lbls, weighted_level: weight})
writer.add_summary(summaries, global_step_count)
# sess.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, global_step, imgs, lbls, weight)
if step % 2000 == 0:
logging.info('Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
print('one batch time: ', time.time() - start_time)
start_time = time.time()
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
# Stop the threads
coord.request_stop()
# Wait for threads to stop
coord.join(threads)
sess.close()
if __name__ == '__main__':
run()
I am new here and don't have enough reputation to post images inline.
Here are two links, one for the accuracy plot and one for the losses plot. You can easily see that the regularization loss is dominant.
This is a difficult question to answer, but I can give some pointers.
In general, when you try to minimize digits_loss, that is, when you fit your model to your data, you slowly change the weights in your layers. To counter potential overfitting, an L2 regularization loss (the sum of the squares of all weights, reg_loss in your code) is generally added to the overall loss (total_loss in your code). These two forces act against each other, and if the balance is right, you train a good model.
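As a rough sketch of what that means for the code above (assuming TF 1.x with slim, and reusing digits_loss and total_loss from the question; this is an illustration, not the poster's exact code):
# The regularization loss is the weight-decay-scaled sum of squared weights,
# collected per variable by slim and added to the data loss.
reg_losses = tf.losses.get_regularization_losses()  # one L2 term per weight tensor
reg_loss = tf.add_n(reg_losses)                      # same value as tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss                  # this is what the optimizer minimizes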
In your case you are taking a network (Inception-ResNet-v2) that was developed for 1,001 classes and trying to predict 8,142 classes. No problem with that per se, but it upsets the balance. So I believe you need to override the default weight decay of 0.00004 for Inception-ResNet-v2 with a higher value, on this line (note only 3 zeros after the decimal point, i.e. a 10x increase):
with slim.arg_scope(inception_resnet_v2_arg_scope(weight_decay=0.0004)):
A higher weight_decay value penalizes large weights more strongly and keeps the L2 loss from growing. The problem is that this number is just a guess; I have no idea what the ideal value would be. You will need to experiment with several values and figure it out.
I'm trying to follow this tutorial on transfer learning. I used my own dataset, and I'm trying to use MobileNet instead of Inception. The problem is that the MobileNet models come with 3 checkpoint files:
mobilenet_v1_0.5_128.ckpt.data-00000-of-00001
mobilenet_v1_0.5_128.ckpt.index
mobilenet_v1_0.5_128.ckpt.meta
When I use one of them I get this error:
NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt.meta
[[Node: save/RestoreV2_139 = RestoreV2[dtypes=[DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2_139/tensor_names, save/RestoreV2_139/shape_and_slices)]]
import tensorflow as tf
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
from tensorflow.python.platform import tf_logging as logging
#from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
from models.research.slim.nets.mobilenet_v1 import mobilenet_v1, mobilenet_v1_arg_scope
import os
import time
import h5py
import numpy as np
slim = tf.contrib.slim
# ================ DATASET INFORMATION ======================
# State dataset directory where the tfrecord files are located
dataset_dir = 'C://Nassima//lymphoma//subs3'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log'
# State where your checkpoint file is
checkpoint_file = 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt.meta'
# State the image size you're resizing your images to. We will use the default inception size of 299.
#image_size = 299
#image_size = 128
# State the number of classes to predict:
num_classes = 3
# State the labels file and read it
labels_file = 'C://Nassima//lymphoma//subs3//labels.txt'
labels = open(labels_file, 'r')
# Create a dictionary to refer each label to their string name
labels_to_name = {}
for line in labels:
label, string_name = line.split(':')
string_name = string_name[:-1] # Remove newline
labels_to_name[int(label)] = string_name
print(labels_to_name)
# Create the file pattern of your TFRecord files so that it could be recognized later on
"""
file_pattern = 'flowers_%s_*.tfrecord'
"""
# Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
items_to_descriptions = {
'image': 'A 3-channel RGB coloured lymphoma image that is either CLL, FL, MCL.',
'label': 'A label that is as such -- 0:CLL, 1:FL, 2:MCL'
}
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 1
# State your batch size
#batch_size = 8
file_mean = "C://Nassima//lymphoma//subs3//train//mean.hdf5"
TRAINING_SET_SIZE = 41860
BATCH_SIZE = 128
IMAGE_SIZE = 144
IMAGE_RESIZE = 128
# Learning rate information and configuration (Up to you to experiment)
initial_learning_rate = 0.0002
learning_rate_decay_factor = 0.7
num_epochs_before_decay = 2
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
class _image_object: # image object from protobuf
def __init__(self):
self.image = tf.Variable([], dtype=tf.string)
self.height = tf.Variable([], dtype=tf.int64)
self.width = tf.Variable([], dtype=tf.int64)
self.filename = tf.Variable([], dtype=tf.string)
self.label = tf.Variable([], dtype=tf.int32)
def read_and_decode(filename_queue, mean):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(serialized_example, features = {
"image/encoded": tf.FixedLenFeature([], tf.string),
"image/height": tf.FixedLenFeature([], tf.int64),
"image/width": tf.FixedLenFeature([], tf.int64),
"image/filename": tf.FixedLenFeature([], tf.string),
"image/class/label": tf.FixedLenFeature([], tf.int64),})
image_encoded = features["image/encoded"]
image_raw = tf.decode_raw(image_encoded, tf.float32)
image_object = _image_object()
#image_object.image = tf.image.resize_image_with_crop_or_pad(image_raw, IMAGE_SIZE, IMAGE_SIZE)
image_r = tf.reshape(image_raw, [IMAGE_SIZE, IMAGE_SIZE, 3])
#added
image_r = image_r - mean
image_r = tf.random_crop(image_r ,[IMAGE_RESIZE ,IMAGE_RESIZE ,3], seed = 0, name = None)
image_object.image = image_r
image_object.height = features["image/height"]
image_object.width = features["image/width"]
image_object.filename = features["image/filename"]
image_object.label = tf.cast(features["image/class/label"], tf.int64)
return image_object
def flower_input(mean, if_random = True, if_training = True):
if(if_training):
filenames = [os.path.join(dataset_dir, "lymphoma_train_0000%d-of-00005.tfrecord" % i) for i in range(0, 5)]
else:
filenames = [os.path.join(dataset_dir, "lymphoma_validation_0000%d-of-00005.tfrecord" % i) for i in range(0, 5)]
for f in filenames:
if not tf.gfile.Exists(f):
raise ValueError("Failed to find file: " + f)
filename_queue = tf.train.string_input_producer(filenames)
image_object = read_and_decode(filename_queue, mean)
image = tf.image.per_image_standardization(image_object.image)
# image = image_object.image
# image = tf.image.adjust_gamma(tf.cast(image_object.image, tf.float32), gamma=1, gain=1) # Scale image to (0, 1)
filename = image_object.filename
label = image_object.label
if(if_random):
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(TRAINING_SET_SIZE * min_fraction_of_examples_in_queue)
print("Filling queue with %d images before starting to train. " "This will take a few minutes." % min_queue_examples)
num_preprocess_threads = 1
image_batch, label_batch, filename_batch = tf.train.shuffle_batch(
[image, label, filename],
batch_size=BATCH_SIZE,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + 3 * BATCH_SIZE,
min_after_dequeue=min_queue_examples)
return image_batch, label_batch, filename_batch
else:
image_batch, label_batch, filename_batch = tf.train.batch(
[image, label, filename],
batch_size=BATCH_SIZE,
num_threads=1)
return image_batch, label_batch, filename_batch
"""
# ============== DATASET LOADING ======================
"""
def run():
# Create the log directory here. Must be done here otherwise import will activate this unneededly.
if not os.path.exists(log_dir):
os.mkdir(log_dir)
# ======================= TRAINING PROCESS =========================
# Now we start to construct the graph and build our model
with tf.Graph().as_default() as graph:
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
# add the mean of the image
hdf5_file = h5py.File(file_mean, "r")
# subtract the training mean
mm = hdf5_file["train_mean"][0, ...]
mm = mm[np.newaxis, ...]
# Total number of samples
mean = tf.convert_to_tensor(mm, np.float32)
# First create the dataset and load one batch
images, labels, _ = flower_input(mean, if_random=True, if_training=True)
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(TRAINING_SET_SIZE / BATCH_SIZE)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(mobilenet_v1_arg_scope()):
logits, end_points = mobilenet_v1(images, num_classes= num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
#exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
exclude = ['MobilenetV1/Logits', 'MobilenetV1/AuxLogits']
#exclude = ["MobilenetV1/Logits/Conv2d_1c_1x1"]
#exclude = []
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = slim.one_hot_encoding(labels, num_classes)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels, logits=logits)
total_loss = tf.losses.get_total_loss() # obtain the regularization losses as well
# Create the global step for monitoring the learning_rate and training.
global_step = get_or_create_global_step()
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# Now we can define the optimizer that takes on the learning rate
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer)
# State the metrics that you want to predict. We get a predictions that is not one_hot_encoded.
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
# Now finally create all the summaries you need to monitor and group them into one summary op.
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
my_summary_op = tf.summary.merge_all()
# Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
def train_step(sess, train_op, global_step):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
# Now we create a saver function that actually restores the variables from a checkpoint file in a sess
saver = tf.train.Saver(variables_to_restore)
saver = tf.train.import_meta_graph(checkpoint_file)
#added
def restore_fn(sess):
return saver.restore(sess, 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt')
# Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
sv = tf.train.Supervisor(logdir=log_dir, summary_op=None, init_fn=restore_fn)
# Run the managed session
with sv.managed_session() as sess:
for step in range(num_steps_per_epoch * num_epochs):
# At the start of every epoch, show the vital information:
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy])
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels])
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 10 step.
if step % 10 == 0:
loss, _ = train_step(sess, train_op, sv.global_step)
summaries = sess.run(my_summary_op)
sv.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, sv.global_step)
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
# saver.save(sess, "./flowers_model.ckpt")
#sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
if __name__ == '__main__':
run()
and the error is
File "C:/Users/hp/PycharmProjects/tfSlim/lympho_mobileNet/train_lymphoma2.py", line 272, in <module>
run()
File "C:/Users/hp/PycharmProjects/tfSlim/lympho_mobileNet/train_lymphoma2.py", line 230, in run
sv = tf.train.Supervisor(logdir=log_dir, summary_op=None, init_fn=restore_fn)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\supervisor.py", line 300, in __init__
self._init_saver(saver=saver)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\supervisor.py", line 448, in _init_saver
saver = saver_mod.Saver()
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1218, in __init__
self.build()
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1227, in build
self._build(self._filename, build_save=True, build_restore=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 1263, in _build
build_save=build_save, build_restore=build_restore)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 729, in _build_internal
saveables = self._ValidateAndSliceInputs(names_to_saveables)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 582, in _ValidateAndSliceInputs
names_to_saveables = BaseSaverBuilder.OpListToDict(names_to_saveables)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 554, in OpListToDict
name)
ValueError: At least two variables have the same name: MobilenetV1/Conv2d_7_depthwise/BatchNorm/gamma
I think it is because of the excluded layers, or because of this line:
tf.train.import_meta_graph(checkpoint_file)
You're loading the meta file, while you should be providing just the path to mobilenet_v1_0.5_128.ckpt
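A minimal sketch of what the restore could look like (an assumption based on the question's code, with checkpoint_prefix as an illustrative name; the key point is that the path ends in .ckpt, the checkpoint prefix, not .ckpt.meta):
checkpoint_prefix = 'C://Users//hp//PycharmProjects//tfSlim/mobilenet_v1_0.5_128//mobilenet_v1_0.5_128.ckpt'
# Keep the Saver built from variables_to_restore, and do not call
# tf.train.import_meta_graph, which imports a second copy of the graph and
# leads to the duplicate-variable error shown above.
saver = tf.train.Saver(variables_to_restore)
def restore_fn(sess):
    return saver.restore(sess, checkpoint_prefix)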
I am trying to create an end-to-end trainable offline English handwriting recognition model (without segmenting individual characters). I am using the word dataset from the IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing the batch size, etc., but the loss keeps fluctuating with no significant overall decrease (see the TensorBoard visualization of the cost at each step).
I am new to TensorFlow so could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)
After trying a lot of things unsuccessfully, I found that an incorrect value was being passed to the sequence_length argument of tf.nn.ctc_loss. It should be set to the length of the input sequence, but I had set it to the length of the output sequence (the labels, i.e. the number of characters).
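For reference, a sketch of the corrected call, reusing the names from the code above (the only change is passing the input lengths, self.lengths, instead of self.lab_lengths):
self.loss = tf.nn.ctc_loss(labels_sparse, logits,
                           sequence_length=self.lengths,  # time steps per input, not label lengths
                           preprocess_collapse_repeated=False,
                           ctc_merge_repeated=False,
                           time_major=True)
self.cost = tf.reduce_mean(self.loss)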
More details can be found in the comments under the accepted answer to this question: CTC Loss InvalidArgumentError: sequence_length(b) <= time
Also, if you have a GPU, it is worth using Baidu's CTC GPU implementation (https://github.com/baidu-research/warp-ctc), as it can speed up training a lot.
The problem is that you are feeding raw images into the LSTM, so it is very difficult for it to extract any useful information. The CRNN paper first uses a series of convolutional layers to extract features from the images, and these features are then fed into the LSTM.
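To make that concrete, here is a rough, hypothetical sketch (TF 1.x tf.layers, a fixed image height of 32 assumed, batch-major for brevity) of a small convolutional front end whose output could replace the raw pixel columns fed to bidirectional_dynamic_rnn in the code above:
import tensorflow as tf
images = tf.placeholder(tf.float32, [None, 32, None, 1])    # [batch, height, width, channels]
conv = tf.layers.conv2d(images, 64, 3, padding='same', activation=tf.nn.relu)
conv = tf.layers.max_pooling2d(conv, 2, 2)                  # height 32 -> 16
conv = tf.layers.conv2d(conv, 128, 3, padding='same', activation=tf.nn.relu)
conv = tf.layers.max_pooling2d(conv, 2, 2)                  # height 16 -> 8
# Collapse the height dimension into features and use image width as the time axis.
features = tf.transpose(conv, [0, 2, 1, 3])                 # [batch, time, 8, 128]
features = tf.reshape(features, [tf.shape(conv)[0], -1, 8 * 128])
# features (transposed to time-major if needed) replaces the raw-pixel sequences.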
I am trying to build a linear classifier on CIFAR-100 using TensorFlow. I took the code from Martin Gorner's MNIST tutorial and changed it a bit. When I run this code, TensorFlow does not train: the code runs, but the accuracy stays at 1.0 and the loss (cross entropy) stays at 4605.17. I don't know what is wrong. I am a newbie to TF, so any help is appreciated.
import pickle
import numpy as np
import os
import tensorflow as tf
from tensorflow.python.framework import tensor_util
import math
#imports data
def unpickle(file):
import pickle
with open(file, 'rb') as fo:
dict = pickle.load(fo, encoding='bytes')
return dict
cifar100_test = {}
cifar100_train = {}
labelMap = {}
labelNames = {}
# Load the raw CIFAR-10 data.
cifar100_test = unpickle('dataset/cifar-100-python/test')
cifar100_train = unpickle('dataset/cifar-100-python/train')
labelMap = unpickle('dataset/cifar-100-python/meta')
#tr for training data and te for testing data, X is data, Y is label
Xtr = cifar100_train[b'data']
Yr = cifar100_train[b'fine_labels']
Xte = cifar100_test[b'data']
Ye = cifar100_test[b'fine_labels']
classNames = labelMap[b'fine_label_names']
num_train = Xtr.shape[0]
num_test = Xte.shape[0]
num_class = len(classNames)
Ytr = np.zeros([num_train, num_class])
Yte = np.zeros([num_test, num_class])
Ytr[0:num_train, Yr[0:num_train]] = 1
Yte[0:num_test, Ye[0:num_test]] = 1
# As a sanity check, we print out the size of the training and test data.
print('Train data shape:', Xtr.shape)
print('Train Label shape:', Ytr.shape)
print('Test data shape:', Xte.shape)
print('Test Label shape:', Yte.shape)
print('Name of Predicted Class:', classNames[0]) #indice of the label name is the indice of the class.
Xtrain = Xtr#[:1000]
Xtest = Xte#[:100]
Ytrain = Ytr#[:1000]
Ytest = Yte#[:100]
print('Train data shape:', Xtrain.shape)
print('Train Label shape:', Ytrain.shape)
print('Test data shape:', Xtest.shape)
print('Test Label shape:', Ytest.shape)
Xtrain = np.reshape(Xtrain,(50000, 32, 32, 3)).transpose(0,1,2,3).astype(float)
Xtest = np.reshape(Xtest,(10000, 32, 32, 3)).transpose(0,1,2,3).astype(float)
Xbatches = np.split(Xtrain, 500); #second number is # of batches
Ybatches = np.split(np.asarray(Ytrain), 500);
XtestB = np.split(Xtest, 100);
YtestB = np.split(Ytest, 100);
print('X # of batches:', len(Xbatches))
print('Y # of batches:', len(Ybatches))
# input X: 28x28 grayscale images, the first dimension (None) will index the images in the mini-batch
X = tf.placeholder(tf.float32, [100, 32, 32, 3])
# correct answers will go here
Y_ = tf.placeholder(tf.float32, [100, 100])
# weights W[784, 10] 784=28*28
W = tf.Variable(tf.zeros([3072, 100]))
# biases b[10]
b = tf.Variable(tf.zeros([100]))
# flatten the images into a single line of pixels
# -1 in the shape definition means "the only possible dimension that will preserve the number of elements"
XX = tf.reshape(X, [-1, 3072])
# The model
Y = tf.nn.softmax(tf.matmul(XX, W) + b)
# loss function: cross-entropy = - sum( Y_i * log(Yi) )
# Y: the computed output vector
# Y_: the desired output vector
# cross-entropy
# log takes the log of each element, * multiplies the tensors element by element
# reduce_mean will add all the components in the tensor
# so here we end up with the total cross-entropy for all images in the batch
cross_entropy = -tf.reduce_mean(Y_ * tf.log(Y)) * 1000.0 # normalized for batches of 100 images,
# *10 because "mean" included an unwanted division by 10
# accuracy of the trained model, between 0 (worst) and 1 (best)
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# training, learning rate = 0.005
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
# init
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(500):
# the backpropagation training step
t, Loss = sess.run([train_step, cross_entropy], feed_dict={X: Xbatches[i], Y_: Ybatches[i]})
print(Loss)
print(i)
for i in range(100):
print('accuracy:', sess.run(accuracy, feed_dict={X: XtestB[i], Y_: YtestB[i]}))
You compute the accuracy a hundred times after the training process has completed, so nothing will change there. You should place your print('accuracy:', ...) inside the for loop in which you perform the backpropagation:
for i in range(500):
# the backpropagation training step
t, Loss = sess.run([train_step, cross_entropy], feed_dict={X: Xbatches[i], Y_: Ybatches[i]})
print(Loss)
print(i)
print('accuracy:', sess.run(accuracy, feed_dict={X: XtestB[i], Y_: YtestB[i]}))
Sorry for the post; it turns out it was a basic mistake.
I changed the following:
Ytr[0:num_train, Yr[0:num_train]] = 1
Yte[0:num_test, Ye[0:num_test]] = 1
with
Ytr[range(num_train), Yr_temp[range(num_train)]] = 1
Yte[range(num_test), Ye_temp[range(num_test)]] = 1
The first version sets all the values to 1, but I only wanted to set the index of the true class to 1 and leave the other elements at 0. Thanks for your time.
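A tiny, hypothetical 3-class example that illustrates the difference between the two indexing styles:
import numpy as np
labels = np.array([2, 0, 1])
bad = np.zeros((3, 3))
bad[0:3, labels[0:3]] = 1             # slice + array: every row gets columns 2, 0 and 1 set
good = np.zeros((3, 3))
good[range(3), labels[range(3)]] = 1  # paired fancy indexing: exactly one 1 per row
print(bad)   # all ones
print(good)  # proper one-hot rows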
I only add a summary for my loss, as 'xentropy_mean', in training(), but in TensorBoard I cannot find an 'xentropy_mean' chart; instead there are many other charts I never defined. I don't know where I went wrong or what is actually happening. Is it because I use threads in my code? If I don't use threads, how should I write it?
The tensorboard screenshot
There are 6 charts under 'queue', and I don't know what they mean either.
I create the model in the file below
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow.python.platform
import tensorflow as tf
# The MNIST dataset has 10 classes, representing the digits 0 through 9.
NUM_CLASSES = 16
# The MNIST images are always 28x28 pixels.
IMAGE_SIZE = 28
IMAGE_PIXELS = 784
def inference(images, hidden1_units, hidden2_units):
"""Build the MNIST model up to where it may be used for inference.
Args:
images: Images placeholder, from inputs().
hidden1_units: Size of the first hidden layer.
hidden2_units: Size of the second hidden layer.
Returns:
softmax_linear: Output tensor with the computed logits.
"""
# Hidden 1
with tf.name_scope('hidden1'):
weights = tf.Variable(
tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]),
name='biases')
hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
# Hidden 2
with tf.name_scope('hidden2'):
weights = tf.Variable(
tf.truncated_normal([hidden1_units, hidden2_units],
stddev=1.0 / math.sqrt(float(hidden1_units))),
name='weights')
biases = tf.Variable(tf.zeros([hidden2_units]),
name='biases')
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
# Linear
with tf.name_scope('softmax_linear'):
weights = tf.Variable(
tf.truncated_normal([hidden2_units, NUM_CLASSES],
stddev=1.0 / math.sqrt(float(hidden2_units))),
name='weights')
biases = tf.Variable(tf.zeros([NUM_CLASSES]),
name='biases')
logits = tf.matmul(hidden2, weights) + biases
return logits
def loss(logits, labels):
batch_size = tf.size(labels)
#print('batch size %d' %(batch_size))
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size), 1)
concated = tf.concat(1, [indices, labels])
#print('Done2')
onehot_labels = tf.sparse_to_dense(
concated, tf.pack([batch_size, 16]), 1.0, 0.0)
#print('Done1')
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits,
onehot_labels,
name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
tf.summary.scalar(loss.op.name, loss)
return loss
def training(loss, learning_rate):
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
global_step=tf.Variable(0,name='global_step',trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
def evaluation(logits, labels):
correct = tf.nn.in_top_k(logits, labels, 1)
# Return the number of true entries.
return tf.reduce_sum(tf.cast(correct, tf.int32))
and train the model in this file:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import time
import numpy as np
import tensorflow as tf
import mnist
# Basic model parameters as external flags.
#FLAGS = None
# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
TEST_FILE='test.tfrecords'
flags = tf.app.flags
FLAGS = flags.FLAGS
#FLAGS = None
flags.DEFINE_string('train_dir', '/home/queenie/image2tfrecord/tfrecords-28-gray/', 'Directory to put the training data.')
flags.DEFINE_string('filename', 'train.tfrecords', 'Directory to put the training data.')
flags.DEFINE_integer('batch_size', 100, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('num_epochs', None, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_integer('hidden1', 128,'balabala')
flags.DEFINE_integer('hidden2', 32,'balabala')
flags.DEFINE_integer('learning_rate', 0.01,'balabala')
flags.DEFINE_integer('max_steps', 50000,'balabala')
def placeholder_inputs(batch_size):
images_placeholder=tf.placeholder(tf.float32,shape=(batch_size,mnist.IMAGE_PIXELS))
labels_placeholder=tf.placeholder(tf.int32,shape=(batch_size))
return images_placeholder,labels_placeholder
def fill_feed_dict(images_feed,labels_feed,images_pl,labels_pl):
feed_dict={
images_pl:images_feed,
labels_pl:labels_feed,
}
return feed_dict
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'image_raw': tf.FixedLenFeature([], tf.string),
'label': tf.FixedLenFeature([], tf.int64),
})
# Convert from a scalar string tensor (whose single string has
# length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
# [mnist.IMAGE_PIXELS].
image = tf.decode_raw(features['image_raw'], tf.uint8)
image.set_shape([mnist.IMAGE_PIXELS])
# OPTIONAL: Could reshape into a 28x28 image and apply distortions
# here. Since we are not applying any distortions in this
# example, and the next step expects the image to be flattened
# into a vector, we don't bother.
# Convert from [0, 255] -> [-0.5, 0.5] floats.
image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
# Convert label from a scalar uint8 tensor to an int32 scalar.
label = tf.cast(features['label'], tf.int32)
return image, label
def do_eval(sess,eval_correct):
true_count=0
for step in xrange(FLAGS.batch_size):
#print(sess.run(eval_correct))
true_count+=sess.run(eval_correct)
precision=float(true_count)/FLAGS.batch_size/FLAGS.batch_size
print(' Num examples: %d Num correct: %d Precision # 1: %0.04f' %
(FLAGS.batch_size, true_count, precision))
return precision
def inputs(train, batch_size, num_epochs):
if not num_epochs: num_epochs = None
if train=='train':
filename=os.path.join(FLAGS.train_dir,TRAIN_FILE)
elif train=='validation':
filename=os.path.join(FLAGS.train_dir,VALIDATION_FILE)
else:
filename=os.path.join(FLAGS.train_dir,TEST_FILE)
# filename = os.path.join(FLAGS.train_dir,
# TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=None)
# Even when reading in multiple threads, share the filename
# queue.
image, label = read_and_decode(filename_queue)
# Shuffle the examples and collect them into batch_size batches.
# (Internally uses a RandomShuffleQueue.)
# We run this in two threads to avoid being a bottleneck.
images, sparse_labels = tf.train.shuffle_batch(
[image, label], batch_size=batch_size, num_threads=2,
capacity=1000 + 3 * batch_size,
# Ensures a minimum amount of shuffling of examples.
min_after_dequeue=1000)
return images, sparse_labels
def run_training():
with tf.Graph().as_default():
# Build a Graph that computes predictions from the inference model.
images, labels = inputs(train='train', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_valid,labels_valid=inputs(train='validation', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
images_test,labels_test=inputs(train='test', batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
logits = mnist.inference(images,
FLAGS.hidden1,
FLAGS.hidden2)
# Add to the Graph the loss calculation.
valid_prediction=mnist.inference(images_valid,FLAGS.hidden1,FLAGS.hidden2)
test_prediction=mnist.inference(images_test,FLAGS.hidden1,FLAGS.hidden2)
loss = mnist.loss(logits, labels)
# Add to the Graph operations that train the model.
train_op = mnist.training(loss, FLAGS.learning_rate)
eval_correct=mnist.evaluation(logits,labels)
eval_correct_valid=mnist.evaluation(valid_prediction,labels_valid)
eval_correct_test=mnist.evaluation(test_prediction,labels_test)
summary_op=tf.merge_all_summaries()
# The op for initializing the variables.
init_op = tf.group(tf.initialize_all_variables(),
tf.initialize_local_variables())
saver = tf.train.Saver()
# Create a session for running operations in the Graph.
sess = tf.Session()
# Initialize the variables (the trained variables and the
# epoch counter).
sess.run(init_op)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
train_precision=0
validation_precision=0
test_precision=0
#while not coord.should_stop():
while not coord.should_stop():
start_time = time.time()
_, loss_value,images_see,labels_see = sess.run([train_op, loss,images,labels])
#print('run done')
duration = time.time() - start_time
# Print an overview fairly often.
if step % 100 == 0:
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
duration))
precision_tr=do_eval(sess,eval_correct)
summary_str=sess.run(summary_op)
summary_writer.add_summary(summary_str,step)
if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_file, global_step=step)
print('Train:')
do_eval(sess,eval_correct)
print('Validation:')
do_eval(sess,eval_correct_valid)
print('Test:')
do_eval(sess,eval_correct_test)
step += 1
except tf.errors.OutOfRangeError:
print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
finally:
# When done, ask the threads to stop.
coord.request_stop()
# Wait for threads to finish.
coord.join(threads)
sess.close()
run_training()
Then I get a TensorBoard like this, with 6 charts about the queue.
The tensorboard screenshot
The queue charts you are seeing are created by default by shuffle_batch and friends, and can be used to monitor the performance of your input pipeline (ideally you want all the queues to stay at capacity, as that means your GPU isn't blocking on input reading).
I don't understand why your summary isn't showing in tensorboard. Can I get more information?
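In the meantime, one quick check (just a debugging sketch, assuming the graph built in run_training()) is to list which summary ops are actually registered before merging, to see whether 'xentropy_mean' is among them:
# Print every summary currently in the default graph's SUMMARIES collection.
for s in tf.get_collection(tf.GraphKeys.SUMMARIES):
    print(s.name)   # 'xentropy_mean' should appear here if tf.summary.scalar ran
summary_op = tf.merge_all_summaries()  # merge only after the scalar summary has been added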
In the code below, l2 surprisingly has the same value as l1, but since the optimizer is requested in the list before the loss, I expected the returned loss to be the new loss after the training step. Can I not request multiple values from the graph at the same time and expect consistent output?
import tensorflow as tf
import numpy as np
x = tf.placeholder(tf.float32, shape=[None, 10])
y = tf.placeholder(tf.float32, shape=[None, 2])
weight = tf.Variable(tf.random_uniform((10, 2), dtype=tf.float32))
loss = tf.nn.sigmoid_cross_entropy_with_logits(tf.matmul(x, weight), y)
optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
with tf.Session() as sess:
tf.initialize_all_variables().run()
X = np.random.rand(1, 10)
Y = np.array([[0, 1]])
# Evaluate loss before running training step
l1 = sess.run([loss], feed_dict={x: X, y: Y})[0][0][0]
print(l1) # 3.32393
# Running the training step
_, l2 = sess.run([optimizer, loss], feed_dict={x: X, y: Y})
print(l2[0][0]) # 3.32393 -- didn't change?
# Evaluate loss again after training step as sanity check
l3 = sess.run([loss], feed_dict={x: X, y: Y})[0][0][0]
print(l3) # 2.71041
No - the order in which you request them in the list has no effect on the evaluation order. For operations with side effects, such as the optimizer, you need to enforce a specific ordering yourself, using with_dependencies or similar control-flow constructs. In general, ignoring side effects, TensorFlow returns results to you by grabbing each node from the graph as soon as it has been computed, and the loss is obviously computed before the optimizer runs, since the optimizer requires the loss as one of its inputs. (Remember that loss is not a variable; it is a tensor, so it is not actually modified by the optimizer step.)
sess.run([loss, optimizer], ...)
and
sess.run([optimizer, loss], ...)
are equivalent.
As Dave points out, the order of arguments to Session.run() has no effect on the order of evaluation, and the loss tensor in your example does not have a dependency on the optimizer op. To add a dependency, you could use tf.control_dependencies() to add an explicit dependency on the optimizer running before fetching the loss:
with tf.control_dependencies([optimizer]):
loss_after_optimizer = tf.identity(loss)
_, l2 = sess.run([optimizer, loss_after_optimizer], feed_dict={x: X, y: Y})
I've tested a logistic regression implemented in TensorFlow with three ways of calling session.run:
all together
res1, res2, res3 = sess.run([op1, op2, op3])
separately
res1 = sess.run(op1)
res2 = sess.run(op2)
res3 = sess.run(op3)
with dependencies
with tf.control_dependencies([op1]):
op2_after = tf.identity(op1)
op3_after = tf.identity(op1)
res1,res2,res3 = session.run([op1, op2_after, op3_after])
With the batch size set to 10000, the results are:
1: 0.05+ secs < 2: 0.11+ secs < 3: 0.25+ secs
The only difference between 1 and 3 is that the values fetched in 1 may lag behind by one mini-batch, so it is probably not worth using 3 instead of 1.
Here is the test code (it is an LR example written by someone else...).
Here is the data
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 2 13:38:14 2017
#author: inse7en
"""
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
import time
pickle_file = '/Users/inse7en/Downloads/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
# This is to expedite the process
train_subset = 10000
# This is a good beta value to start with
beta = 0.01
graph = tf.Graph()
with graph.as_default():
# Input data.
# They're all constants.
tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
tf_train_labels = tf.constant(train_labels[:train_subset])
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Variables
# They are variables we want to update and optimize.
weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
biases = tf.Variable(tf.zeros([num_labels]))
# Training computation.
logits = tf.matmul(tf_train_dataset, weights) + biases
# Original loss function
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
# Loss function using L2 Regularization
regularizer = tf.nn.l2_loss(weights)
loss = tf.reduce_mean(loss + beta * regularizer)
# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
num_steps = 50
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
with tf.Session(graph=graph) as session:
# This is a one-time operation which ensures the parameters get initialized as
# we described in the graph: random weights for the matrix, zeros for the
# biases.
tf.initialize_all_variables().run()
print('Initialized')
for step in range(num_steps):
# Run the computations. We tell .run() that we want to run the optimizer,
# and get the loss value and the training predictions returned as numpy
# arrays.
#_, l, predictions = session.run([optimizer, loss, train_prediction])
start_time = time.time()
with tf.control_dependencies([optimizer]):
loss_after_optimizer = tf.identity(loss)
predictions_after = tf.identity(train_prediction)
regularizers_after = tf.identity(regularizer)
_, l, predictions,regularizers = session.run([optimizer, loss_after_optimizer, predictions_after, regularizers_after])
print("--- with dependencies: %s seconds ---" % (time.time() - start_time))
#start_time = time.time()
#opt = session.run(optimizer)
#l = session.run(loss)
#predictions = session.run(train_prediction)
#regularizers = session.run(regularizer)
#print("--- run separately: %s seconds ---" % (time.time() - start_time))
#start_time = time.time()
#_, l, predictions,regularizers = session.run([optimizer, loss, train_prediction, regularizer])
#print("--- all together: %s seconds ---" % (time.time() - start_time))
#if (step % 100 == 0):
#print('Loss at step {}: {}'.format(step, l))
#print('Training accuracy: {:.1f}'.format(accuracy(predictions,
#train_labels[:train_subset, :])))
# Calling .eval() on valid_prediction is basically like calling run(), but
# just to get that one numpy array. Note that it recomputes all its graph
# dependencies.
# You don't have to do .eval above because we already ran the session for the
# train_prediction
#print('Validation accuracy: {:.1f}'.format(accuracy(valid_prediction.eval(),
#valid_labels)))
#print('Test accuracy: {:.1f}'.format(accuracy(test_prediction.eval(), test_labels)))
#print(regularizer)