I noticed a big difference in speed between loading my training data into memory and feeding it into the graph as a numpy array versus using a shuffle batch of the same size. My data has ~1000 instances.
Using the in-memory array, 1000 iterations take less than a few seconds, but using a shuffle batch it takes almost 10 minutes. I understand the shuffle batch should be a bit slower, but this seems far too slow. Why is this?
Added a bounty. Any suggestions on how to make shuffled mini-batches faster?
Here is the training data: Link to bounty_training.csv (pastebin)
Here is my code:
shuffle_batch
import numpy as np
import tensorflow as tf
data = np.loadtxt('bounty_training.csv',
delimiter=',',skiprows=1,usecols = (0,1,2,3,4,5,6,7,8,9,10,11,12,13,14))
filename = "test.tfrecords"
with tf.python_io.TFRecordWriter(filename) as writer:
for row in data:
features, label = row[:-1], row[-1]
example = tf.train.Example()
example.features.feature['features'].float_list.value.extend(features)
example.features.feature['label'].float_list.value.append(label)
writer.write(example.SerializeToString())
def read_and_decode_single_example(filename):
filename_queue = tf.train.string_input_producer([filename],
num_epochs=None)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
features={
'label': tf.FixedLenFeature([], np.float32),
'features': tf.FixedLenFeature([14], np.float32)})
pdiff = features['label']
avgs = features['features']
return avgs, pdiff
avgs, pdiff = read_and_decode_single_example(filename)
n_features = 14
batch_size = 1000
hidden_units = 7
lr = .001
avgs_batch, pdiff_batch = tf.train.shuffle_batch(
[avgs, pdiff], batch_size=batch_size,
capacity=5000,
min_after_dequeue=2000)
X = tf.placeholder(tf.float32,[None,n_features])
Y = tf.placeholder(tf.float32,[None,1])
W = tf.Variable(tf.truncated_normal([n_features,hidden_units]))
b = tf.Variable(tf.zeros([hidden_units]))
Wout = tf.Variable(tf.truncated_normal([hidden_units,1]))
bout = tf.Variable(tf.zeros([1]))
hidden1 = tf.matmul(X,W) + b
pred = tf.matmul(hidden1,Wout) + bout
loss = tf.reduce_mean(tf.squared_difference(pred,Y))
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)
with tf.Session() as sess:
init = tf.global_variables_initializer()
sess.run(init)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for step in range(1000):
x_, y_ = sess.run([avgs_batch,pdiff_batch])
_, loss_val = sess.run([optimizer,loss],
feed_dict={X: x_, Y: y_.reshape(batch_size,1)} )
if step % 100 == 0:
print(loss_val)
coord.request_stop()
coord.join(threads)
Full batch via numpy array
"""
avgs and pdiff loaded into numpy arrays first...
Same model as above
"""
with tf.Session() as sess:
init = tf.global_variables_initializer()
sess.run(init)
for step in range(1000):
_, loss_value = sess.run([optimizer,loss],
feed_dict={X: avgs,Y: pdiff.reshape(n_instances,1)} )
In this case, you're running the session twice per step: once to pull avgs_batch and pdiff_batch out of the queue, and once for the actual training call that feeds them back in through feed_dict. That doesn't explain the magnitude of the slow-down, but it's definitely overhead you should keep in mind; ideally the batch tensors would be consumed by the graph directly rather than fetched and re-fed (the answer below about removing the placeholders covers this).
I suspect most of the slow-down is coming from use of TFRecordReader. I don't pretend to understand the inner workings of tensorflow, but you might find my answer here helpful.
Summary
create minimal data associated with each example, e.g. image filenames or IDs rather than entire images;
convert to tensorflow ops with tensorflow.python.framework.ops.convert_to_tensor;
use tf.train.slice_input_producer to get a tensor for a single example;
do some preprocessing on individual examples - e.g. load images from filenames;
group them into batches using tf.train.batch (see the sketch below).
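For example, a minimal sketch of that pipeline (the filenames, label values, and image size below are made up for illustration; only the ops come from the summary above):
import tensorflow as tf

# Hypothetical inputs: a list of image filenames and their integer labels.
image_paths = ['img_0.jpg', 'img_1.jpg', 'img_2.jpg']
labels = [0, 1, 1]

# Convert the Python lists to tensors.
paths_t = tf.convert_to_tensor(image_paths, dtype=tf.string)
labels_t = tf.convert_to_tensor(labels, dtype=tf.int32)

# slice_input_producer yields one (filename, label) pair at a time from a shuffling queue.
path, label = tf.train.slice_input_producer([paths_t, labels_t], shuffle=True)

# Per-example preprocessing: load and decode the image from its filename.
image = tf.image.decode_jpeg(tf.read_file(path), channels=3)
image = tf.image.resize_images(image, [224, 224])
image.set_shape([224, 224, 3])

# Group single examples into batches (queue runners must be started before fetching).
image_batch, label_batch = tf.train.batch([image, label], batch_size=32)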
The trick is that instead of feeding single examples into shuffle_batch, you feed it an (n+1)-dimensional tensor of examples with enqueue_many=True. I found this thread very helpful:
TFRecordReader seems extremely slow, and multi-threads reading not working
def get_batch(batch_size):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
batch_list = []
for i in range(batch_size):
batch_list.append(serialized_example)
return [batch_list]
batch_serialized_example = tf.train.shuffle_batch(
get_batch(batch_size), batch_size=batch_size,
capacity=100*batch_size,
min_after_dequeue=batch_size*10,
num_threads=1,
enqueue_many=True)
features = tf.parse_example(
batch_serialized_example,
features={
'label': tf.FixedLenFeature([], np.float32),
'features': tf.FixedLenFeature([14], np.float32)})
batch_pdiff = features['label']
batch_avgs = features['features']
...
When using queues to get the data, you shouldn't use feed_dict. Instead, make your graph depend directly on the input data, that is:
remove the X and Y placeholders
use your feature batch directly
hidden1 = tf.matmul(avgs_batch,W) + b
similarly, use the label batch (pdiff_batch) instead of Y when computing the loss
finally, just keep the second session.run to compute the loss directly, and without using feed_dict
# x_, y_ = sess.run([avgs_batch,pdiff_batch])
# _, loss_val = sess.run([optimizer,loss],
#                        feed_dict={X: x_, Y: y_.reshape(batch_size,1)} )
_, loss_val = sess.run([optimizer,loss])
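Putting those changes together for the model in the question, a minimal sketch of the queue-driven version (my own consolidation; all names come from the question's code) looks like this:
# Build the graph directly on the queue output instead of on placeholders.
avgs_batch, pdiff_batch = tf.train.shuffle_batch(
    [avgs, pdiff], batch_size=batch_size,
    capacity=5000, min_after_dequeue=2000)

hidden1 = tf.matmul(avgs_batch, W) + b
pred = tf.matmul(hidden1, Wout) + bout
# Reshape the label batch to [batch_size, 1] so it lines up with pred.
loss = tf.reduce_mean(tf.squared_difference(pred, tf.reshape(pdiff_batch, [-1, 1])))
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for step in range(1000):
        # A single session call per step; the queue feeds the graph internally.
        _, loss_val = sess.run([optimizer, loss])
        if step % 100 == 0:
            print(loss_val)
    coord.request_stop()
    coord.join(threads)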
Related
I am training a GAN model. For loading the dataset, I am using the Dataset API of TensorFlow.
# train_dataset has image and label. z_train dataset has noise (z).
train_dataset = tf.data.TFRecordDataset(train_file)
z_train = tf.data.Dataset.from_tensor_slices(tf.random_uniform([total_training_samples, seq_length, z_dim],
minval=0, maxval=1, dtype=tf.float32))
train_dataset = tf.data.Dataset.zip((train_dataset, z_train))
Creating Iterator:
iter = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
Using the iterator:
(img, label), z = iter.get_next()
train_init_op = iter.make_initializer(train_dataset)
While training the GAN in session:
Training Discriminator first:
_, disc_loss_value = sess.run([disc_optim, disc_loss])
then training Generator:
_, gen_loss_value = sess.run([gen_optim, gen_loss])
Here is the catch: since I am using the label as a condition (CGAN) in both the discriminator and the generator graph, the two sess.run calls produce two different batches of labels during the same pass over a batch.
for epoch in range(num_of_epochs):
    sess.run([tf.global_variables_initializer(), train_init_op])
    for batch in range(num_of_batches):
        _, disc_loss_value = sess.run([disc_optim, disc_loss])
        _, gen_loss_value = sess.run([gen_optim, gen_loss])
Since I have to feed the same batch of labels to the generator's session run as to the discriminator's, how can I prevent the Dataset API from producing two different batches within the same iteration of the loop?
Note: I am using TensorFlow v1.9
Thanks in advance.
You can create 2 iterators for the same dataset. If you need to shuffle the dataset, you can even do that by specifying the seed as a tensor. See example below.
import tensorflow as tf
seed_ts = tf.placeholder(tf.int64)
ds = tf.data.Dataset.from_tensor_slices([1,2,3,4,5]).shuffle(5, seed=seed_ts, reshuffle_each_iteration=True)
it1 = ds.make_initializable_iterator()
it2 = ds.make_initializable_iterator()
input1 = it1.get_next()
input2 = it2.get_next()
with tf.Session() as sess:
for ep in range(10):
sess.run(it1.initializer, feed_dict={seed_ts: ep})
sess.run(it2.initializer, feed_dict={seed_ts: ep})
print("Epoch" + str(ep))
for i in range(5):
x = sess.run(input1)
y = sess.run(input2)
print([x, y])
I am fine-tuning InceptionResnetV2 with TensorFlow. While training, the regularization loss keeps increasing linearly and even becomes much larger than the cross-entropy loss in the later stages of training. I have checked the training procedure and made sure I am optimizing the combined cross-entropy loss and L2 loss.
Can anyone explain this strange behaviour a little? Any feedback is appreciated.
Here is the code and some TensorBoard plots.
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
from preprocessing import aug_parallel_v2
import numpy as np
slim = tf.contrib.slim
# total training data number
sample_num = 625020
data_path = 'iNaturalist_train.tfrecords'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log_v5'
# tensorboard visualization path
filewriter_path = './filewriter_v5_Logits'
# State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
checkpoint_save_addr = './log_v5/fine-tuning_v5.ckpt'
# State the image size you're resizing your images to. We will use the default inception size of 299.
image_size = 299
# State the number of classes to predict:
num_classes = 8142
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 5
# State your batch size
batch_size = 60
# Learning rate information and configuration
initial_learning_rate = 0.0005
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 2
# put weight on different classes inversely proportional
# to total number of their image samples
label_count = np.loadtxt('label_count.txt', dtype=int)
inverse = lambda t: 1 / t
vfunc = np.vectorize(inverse)
multiplier = vfunc(label_count)
multiplier /= np.mean(multiplier)
def run():
if not os.path.exists(log_dir):
os.mkdir(log_dir)
feature = {'train/height': tf.FixedLenFeature([], tf.int64),
'train/width': tf.FixedLenFeature([], tf.int64),
'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64),
'train/sup_label': tf.FixedLenFeature([], tf.int64),
'train/aug_level': tf.FixedLenFeature([], tf.int64)}
# create a list of file names
filename_queue = tf.train.string_input_producer([data_path], num_epochs=None)
print(filename_queue)
reader = tf.TFRecordReader()
_, tfrecord_serialized = reader.read(filename_queue)
features = tf.parse_single_example(tfrecord_serialized, features=feature)
# Convert the image data from string back to the numbers
height = tf.cast(features['train/height'], tf.int64)
width = tf.cast(features['train/width'], tf.int64)
# change this line for your TFrecord version
tf_image = tf.image.decode_jpeg(features['train/image'])
tf_label = tf.cast(features['train/label'], tf.int32)
aug_level = tf.cast(features['train/aug_level'], tf.int32)
# tf_sup_label = tf.cast(features['train/sup_label'], tf.int64)
tf_image = tf.reshape(tf_image, tf.stack([height, width, 3]))
tf_label = tf.reshape(tf_label, [1])
aug_level = tf.reshape(aug_level, [1])
resized_image = tf.image.resize_images(images=tf_image, size=tf.constant([400, 400]), method=2)
resized_image = tf.cast(resized_image, tf.uint8)
tf_images, tf_labels, tf_aug = tf.train.shuffle_batch([resized_image, tf_label, aug_level], batch_size=batch_size,
capacity=2048, num_threads=16, allow_smaller_final_batch=False,
min_after_dequeue=256)
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
IMAGE_HEIGHT = 299
IMAGE_WIDTH = 299
images = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, 1])
weighted_level = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(sample_num / batch_size)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(images, num_classes=num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
print("label test")
print(labels)
print(logits)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = tf.squeeze(tf.one_hot(labels, num_classes), [1])
print(one_hot_labels)
print(logits)
weighted_onehot = tf.multiply(one_hot_labels, weighted_level)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
digits_loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_onehot, logits=logits)
reg_loss = tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss
    # Create the global step; it is referenced by the learning-rate schedule and the saver below
    global_step = tf.train.get_or_create_global_step()
    # Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# train_vars = []
# Now we can define the optimizer that takes on the learning rate
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
"InceptionResnetV2/Logits")
# RMSProp or Adam
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer, variables_to_train=train_vars)
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.metrics.accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
tf.summary.scalar('losses/Reg_Loss', reg_loss)
tf.summary.scalar('losses/Digit_Loss', digits_loss)
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
writer = tf.summary.FileWriter(filewriter_path)
writer.add_graph(tf.get_default_graph())
my_summary_op = tf.summary.merge_all()
def train_step(sess, train_op, global_step, imgs, lbls, weight):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed
for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: digit_loss: %.4f (%.2f sec/step)',
global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
saver_pretrain = tf.train.Saver(variables_to_restore)
saver_train = tf.train.Saver(train_vars)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
# Create a coordinator and run all QueueRunner objects
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
saver_pretrain.restore(sess, checkpoint_file)
start_time = time.time()
for step in range(int(num_steps_per_epoch * num_epochs)):
imgs, lbls, augs = sess.run([tf_images, tf_labels, tf_aug])
imgs, lbls = aug_parallel_v2(imgs, lbls, augs)
imgs = imgs[:, 50:349, 50:349, :]
imgs = 2*(imgs.astype(np.float32)) - 1
lbls = lbls.astype(np.int32)
weight = multiplier[lbls]
weight = np.array(weight).reshape((batch_size, 1))
# print(imgs[0, 0:10, 0:10, 0:2])
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
            # Log the summaries every 20 steps.
if step % 20 == 0:
loss, global_step_count = train_step(sess, train_op, global_step, imgs, lbls, weight)
summaries = sess.run(my_summary_op, feed_dict={images: imgs, labels: lbls, weighted_level: weight})
writer.add_summary(summaries, global_step_count)
# sess.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, global_step, imgs, lbls, weight)
if step % 2000 == 0:
logging.info('Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
print('one batch time: ', time.time() - start_time)
start_time = time.time()
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
# Stop the threads
coord.request_stop()
# Wait for threads to stop
coord.join(threads)
sess.close()
if __name__ == '__main__':
run()
I am new here, and don't have enough reputation to post images.
Here are two links for the accuracy plot and losses plot. You can easily tell the regularization loss is in a dominant position.
This is a difficult question to answer. I can give some pointers though.
In general, when you try to minimize digits_loss, that is, to fit your model to your data, you slowly change the weights in your layers. To counter potential overfitting, an L2 regularization loss (the sum of the squares of all weights, reg_loss in your code) is generally added to the overall loss (total_loss in your code). These two forces generally act against each other, and if the balance is right, you train a good model.
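As a toy illustration of that composition (made-up shapes and decay value, not the slim internals; slim attaches the decay to each layer via the arg_scope, but the resulting total is the same digits_loss + reg_loss as in your code):
import tensorflow as tf

weight_decay = 0.0004                                # made-up value for the sketch
x = tf.random_normal([32, 128])                      # fake batch of features
y = tf.one_hot(tf.zeros([32], tf.int32), depth=10)   # fake one-hot labels

W = tf.get_variable('W_toy', shape=[128, 10])
logits = tf.matmul(x, W)

# Data-fitting term: cross entropy between labels and predictions.
data_loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)

# Regularization term: weight decay times the sum of squared weights
# (tf.nn.l2_loss(W) computes sum(W**2) / 2).
reg_loss = weight_decay * tf.nn.l2_loss(W)

# The optimizer pulls data_loss down while reg_loss pushes the weights toward zero;
# weight_decay sets the balance between the two forces.
total_loss = data_loss + reg_loss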
In your case you're taking a network (resnet_v2) that was developed for 1,001 classes and trying to predict 8,142 classes. No problem with that per se, but you're upsetting the balance. So I believe you need to override the default weight decay of 0.00004 for resnet v2 with a higher value, in this line (note only 3 zeros in the decimals for a 10x increase):
with slim.arg_scope( inception_resnet_v2_arg_scope( weight_decay = 0.0004 ) ):
A higher weight_decay parameter will force the L2 loss to decrease faster. The problem is that this number is just a guess; I have no idea what an ideal value would be. You need to experiment with multiple values and figure it out.
I'm trying to train a model on a .csv dataset (5008 columns, 533 rows).
I'm using a textreader to parse the data into two tensors, one holding the data to train on [example] and one holding the correct labels [label]:
def read_my_file_format(filename_queue):
reader = tf.TextLineReader()
key, record_string = reader.read(filename_queue)
record_defaults = [[0.5] for row in range(5008)]
#Left out most of the columns for obvious reasons
col1, col2, col3, ..., col5008 = tf.decode_csv(record_string, record_defaults=record_defaults)
example = tf.stack([col1, col2, col3, ..., col5007])
label = col5008
return example, label
def input_pipeline(filenames, batch_size, num_epochs=None):
filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs, shuffle=True)
example, label = read_my_file_format(filename_queue)
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch([example, label], batch_size=batch_size, capacity=capacity, min_after_dequeue=min_after_dequeue)
return example_batch, label_batch
This part is working, when executing something like:
with tf.Session() as sess:
ex_b, l_b = input_pipeline(["Tensorflow_vectors.csv"], 10, 1)
print("Test: ",ex_b)
my result is Test: Tensor("shuffle_batch:0", shape=(10, 5007), dtype=float32)
So far this seems fine to me. Next I've created a simple model consisting of two hidden layers (512 and 256 nodes respectively). Where things go wrong is when I try to train the model:
batch_x, batch_y = input_pipeline(["Tensorflow_vectors.csv"], batch_size)
_, cost = sess.run([optimizer, cost], feed_dict={x: batch_x.eval(), y: batch_y.eval()})
I've based this approach on this example that uses the MNIST database.
However, when I'm executing this, even when I'm just using batch_size = 1, Tensorflow just hangs. If I leave out the .eval() functions that should get the actual data from the tensors, I get the following response:
TypeError: The value of a feed cannot be a tf.Tensor object. Acceptable feed values include Python scalars, strings, lists, or numpy ndarrays.
Now this I can understand, but I don't understand why the program hangs when I do include the .eval() function and I don't know where I could find any information about this issue.
EDIT: I included the most recent version of my entire script here. The program still hangs even though I implemented (as far as I know, correctly) the solution offered by vijay m.
As the error says, you are trying to feed a tensor through feed_dict. You have defined an input_pipeline queue, and you can't pass its output tensors as feed values. (That is also why your script hangs when you call .eval() on the batch tensors: they dequeue from queues that are only filled once the queue runners have been started.) The proper way to pass the data to the model and train on it is shown in the code below:
# A queue which will return batches of inputs
batch_x, batch_y = input_pipeline(["Tensorflow_vectors.csv"], batch_size)
# Feed it to your neural network model:
# Every time this is called, it will pull data from the queue.
logits = neural_network(batch_x, batch_y, ...)
# Define cost and optimizer
cost = ...
optimizer = ...
# Evaluate the graph on a session:
with tf.Session() as sess:
init_op = ...
sess.run(init_op)
# Start the queues
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
# Loop through data and train
    for step in range(num_steps):  # loop through however many training steps you want
        _, cost_value = sess.run([optimizer, cost])
coord.request_stop()
coord.join(threads)
I am trying to replicate the structure used in the TensorBoard MNIST example from the recent 2017 dev summit (code found here). In it, feed_dicts are used to alternate between the training and validation sets; however, the example uses the very non-transparent mnist.train.next_batch, which makes it difficult to adapt to your own data.
Admittedly, this may also be because I'm struggling to understand the queueing implementation in Tensorflow, and explicit examples seem to be in short supply, especially for TF > v1.0.
I've made my own attempt at an image-classifying CNN based on various examples I stumbled across. Originally I had it working with just training data by storing the data in pre-loaded variables (it's a small data set). I assumed it would be easier to get the train/valid swap working by feeding data from filenames, so I tried to change it to that.
Between changing the format and trying to implement the feed_dict train/valid structure, I get the following -
Error: "You must feed a value for placeholder tensor 'input/Placeholder_2' with dtype string".
Any tips as to how to get it working or further explanation as to how the slicer/train.batch/QueueRunner actually work together would be of great help, as I have found the Tensorflow tutorial to be lacking in terms of explaining the basic workflow between them.
I have a feeling I have the train.batch in the completely wrong spot and that it should probably be in the feed_dict def, but no idea otherwise. Thanks!
import tensorflow as tf
import numpy as np
from tensorflow.python.framework import dtypes
# Input - 216x216x1 images; ~900 training images, ~350 validation
# Want to do batches of 5 for training, 20 for validation
learn_rate = .0001
drop_keep = 0.9
train_batch = 5
test_batch = 20
epochs = 1
iterations = int((885/train_batch) * epochs)
#
#
# A BUNCH OF (graph-building) HELPER DEFINITIONS EXCLUDED FOR BREVITY
#
#
#x_init will be fed a list of .jpg filenames (ex: [/file0.jpg, /file1.jpg, ...])
#y_init will be fed an array of one-hot classes (ex: [[0,1,0], [1,0,0], ...])
sess = tf.InteractiveSession()
with tf.name_scope('input'):
batch_size = tf.placeholder(tf.int32)
keep_prob = tf.placeholder(tf.float32)
x_init = tf.placeholder(dtype=tf.string, shape=(None))
y_init = tf.placeholder(dtype=np.int32, shape=(None,3)) #3 classes
image, label = tf.train.slice_input_producer([x_init, y_init])
file = tf.read_file(image)
image = tf.image.decode_jpeg(file, channels=1)
image = tf.cast(image, tf.float32)
image.set_shape([216,216,1])
label = tf.cast(label, tf.int32)
images, labels = tf.train.batch([image, label], batch_size=batch_size)
conv1 = conv_layer(images, [5,5,1], 40, 'conv1')
#
#
# skip the rest of graph defining/functions (merged,train_step)
# very similar to what is found in the MNIST example.
#
#
tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(OUTPUT_LOC + '/train',sess.graph)
test_writer = tf.summary.FileWriter(OUTPUT_LOC + '/test')
sess.run(tf.global_variables_initializer())
#xTrain, yTrain, xTest, yTest are the train/valid images/labels lists
def feed_dict(train=True):
if train:
batch = train_batch
keep = drop_keep
xval = xTrain
yval = yTrain
else:
batch = test_batch
keep = 1
xval = xTest
yval = yTest
return({x_init:xval, y_init:yval, batch_size:batch, keep_prob:keep})
#If I run "threads", I get the error. It works up until here.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess,coord=coord)
#Don't know what works here or what doesn't.
for i in range(iterations):
if i % 10 == 0:
summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
test_writer.add_summary(summary, i)
print('Accuracy at step %s: %s' % (i, acc))
else:
if i % 100 == 99:
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True), options=run_options, run_metadata=run_metadata)
train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
train_writer.add_summary(summary, i)
print('Adding run metadata for', i)
else: # Record a summary
summary, _ = sess.run([merged, train_step],feed_dict=feed_dict(True))
train_writer.add_summary(summary, i)
coord.request_stop()
train_writer.close()
test_writer.close()
sess.close()
I am trying to use TensorBoard to visualize my training procedure. My goal is that, when every epoch completes, I test the network's accuracy on the whole validation dataset and store that accuracy result in a summary file, so that I can visualize it in TensorBoard.
I know TensorFlow has a summary_op for this; however, it seems to work on only one batch when running sess.run(summary_op). I need to calculate the accuracy for the whole dataset. How?
Is there any example to do it?
Define a tf.scalar_summary that accepts a placeholder:
accuracy_value_ = tf.placeholder(tf.float32, shape=())
accuracy_summary = tf.scalar_summary('accuracy', accuracy_value_)
Then calculate the accuracy for the whole dataset (define a routine that calculates the accuracy for every batch in the dataset and extract the mean value) and save it into a python variable, let's call it va.
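A minimal sketch of such a routine, assuming a per-batch accuracy tensor and input/label placeholders like the X, y, and acc in the example further down (all the argument names here are illustrative):
import numpy as np

def whole_set_accuracy(sess, acc_op, x_ph, y_ph, images, labels, batch_size=100):
    # Run the per-batch accuracy op over the whole set and take the mean,
    # weighting by batch size so an uneven final batch does not skew the result.
    weighted_sum, total = 0.0, 0
    for start in range(0, len(images), batch_size):
        end = min(start + batch_size, len(images))
        batch_acc = sess.run(acc_op, feed_dict={x_ph: images[start:end],
                                                y_ph: labels[start:end]})
        weighted_sum += batch_acc * (end - start)
        total += end - start
    return weighted_sum / total

# va = whole_set_accuracy(sess, acc, X, y, valid_x, valid_y)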
Once you have the value of va, just run the accuracy_summary op, feeding the accuracy_value_ placeholder:
sess.run(accuracy_summary, feed_dict={accuracy_value_: va})
I implemented a naive one-layer model as an example to classify the MNIST dataset and visualize validation accuracy in TensorBoard; it works for me.
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
import os
# number of epoch
num_epoch = 1000
model_dir = '/tmp/tf/onelayer_model/accu_info'
# mnist dataset location, change if you need
data_dir = '../data/mnist'
# load MNIST dataset without one hot
dataset = read_data_sets(data_dir, one_hot=False)
# Create placeholder for input images X and labels y
X = tf.placeholder(tf.float32, [None, 784])
# one_hot = False
y = tf.placeholder(tf.int32)
# One layer model graph
W = tf.Variable(tf.truncated_normal([784, 10], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=[10]))
logits = tf.nn.relu(tf.matmul(X, W) + b)
init = tf.initialize_all_variables()
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)
# loss function
loss = tf.reduce_mean(cross_entropy)
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
_, top_1_op = tf.nn.top_k(logits)
top_1 = tf.reshape(top_1_op, shape=[-1])
correct_classification = tf.cast(tf.equal(top_1, y), tf.float32)
# accuracy function
acc = tf.reduce_mean(correct_classification)
# define info that is used in SummaryWritter
acc_summary = tf.scalar_summary('valid_accuracy', acc)
valid_summary_op = tf.merge_summary([acc_summary])
with tf.Session() as sess:
# initialize all the variable
sess.run(init)
print("Writing Summaries to %s" % model_dir)
train_summary_writer = tf.train.SummaryWriter(model_dir, sess.graph)
# load validation dataset
valid_x = dataset.validation.images
valid_y = dataset.validation.labels
for epoch in xrange(num_epoch):
batch_x, batch_y = dataset.train.next_batch(100)
feed_dict = {X: batch_x, y: batch_y}
_, acc_value, loss_value = sess.run(
[train_op, acc, loss], feed_dict=feed_dict)
vsummary = sess.run(valid_summary_op,
feed_dict={X: valid_x,
y: valid_y})
# Write validation accuracy summary
train_summary_writer.add_summary(vsummary, epoch)
Batching your validation set is possible if you use the tf.metrics ops, which keep internal counters. Here is a simplified example:
model = create_model()
tf.summary.scalar('cost', model.cost_op)
acc_value_op, acc_update_op = tf.metrics.accuracy(labels,predictions)
summary_common = tf.summary.merge_all()
summary_valid = tf.summary.merge([
tf.summary.scalar('accuracy', acc_value_op),
# other metrics here...
])
with tf.Session() as sess:
train_writer = tf.summary.FileWriter(logs_path + '/train',
sess.graph)
valid_writer = tf.summary.FileWriter(logs_path + '/valid')
While training, only write the common summary using your train-writer:
summary = sess.run(summary_common)
train_writer.add_summary(summary, tf.train.global_step(sess, gstep_op))
train_writer.flush()
After every validation, write both summaries using the valid-writer:
gstep, summaryc, summaryv = sess.run([gstep_op, summary_common, summary_valid])
valid_writer.add_summary(summaryc, gstep)
valid_writer.add_summary(summaryv, gstep)
valid_writer.flush()
When using tf.metrics, don't forget to reset the internal counters (local variables) before every validation step.
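A sketch of that reset, assuming the streaming accuracy was created as above (the 'accuracy' scope name is the tf.metrics.accuracy default, so adjust it if you passed an explicit name):
# tf.metrics.accuracy stores its running total and count in local variables.
# The blunt option resets every local variable:
reset_all_op = tf.local_variables_initializer()

# A more targeted option re-initializes only the accuracy counters:
metric_vars = [v for v in tf.local_variables() if v.name.startswith('accuracy/')]
reset_metrics_op = tf.variables_initializer(metric_vars)

# Before each validation pass:
#   sess.run(reset_metrics_op)
#   run acc_update_op once per validation batch
#   then write summary_valid with the valid-writer as shown above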