How can I get the global_step in a MonitoredTrainingSession? - tensorflow

I am running distributed an mnist model in distributed TensorFlow. I would like to monitor "manually" the evolution of the global_step for debugging purposes. What is the best and clean way to get the global step in a distributed TensorFlow setting?
My code below
...
with tf.device(device):
images = tf.placeholder(tf.float32, [None, 784], name='image_input')
labels = tf.placeholder(tf.float32, [None], name='label_input')
data = read_data_sets(FLAGS.data_dir,
one_hot=False,
fake_data=False)
logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
loss = mnist.loss(logits, labels)
loss = tf.Print(loss, [loss], message="Loss = ")
train_op = mnist.training(loss, FLAGS.learning_rate)
hooks=[tf.train.StopAtStepHook(last_step=FLAGS.nb_steps)]
with tf.train.MonitoredTrainingSession(
master=target,
is_chief=(FLAGS.task_index == 0),
checkpoint_dir=FLAGS.log_dir,
hooks = hooks) as sess:
while not sess.should_stop():
xs, ys = data.train.next_batch(FLAGS.batch_size, fake_data=False)
sess.run([train_op], feed_dict={images:xs, labels:ys})
global_step_value = # ... what is the clean way to get this variable

Normally a good practice is to initialize your global step variable in your graph-defining process, e.g. global_step = tf.Variable(0, trainable=False, name='global_step'). Then you can use graph.get_tensor_by_name("global_step:0") to get your global step easily.

Related

How to restore weights and biases in Tensorflow?

I am using Tensorflow and Python to define and train a NN in the following manner:
# clear old tensor values that cause a problem when rerunning
tf.reset_default_graph()
#network dims
hiddenneurons=12
input_dim=3
# Set up the model
def neural_net_model(X_data,input_dim):
W_1 = tf.Variable(tf.random_uniform([input_dim,hiddenneurons]),name='W_1')
b_1 = tf.Variable(tf.zeros([hiddenneurons]),name='b_1')
layer_1 = tf.add(tf.matmul(X_data,W_1), b_1)
layer_1 = tf.nn.tanh(layer_1)
# layer 1 multiplying and adding bias then activation function
W_O = tf.Variable(tf.random_uniform([hiddenneurons,1]),name='W_O')
b_O = tf.Variable(tf.zeros([1]),name='b_O')
output = tf.add(tf.matmul(layer_1,W_O), b_O)
# O/p layer multiplying and adding bias then activation function
return output
xs = tf.placeholder("float")
ys = tf.placeholder("float")
output = neural_net_model(xs,3)
cost = tf.reduce_mean(tf.square(output-ys))
train = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
c_t = []
c_test = []
with tf.Session() as sess:
# Initiate session and initialize all vaiables
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
for i in range(10):
for j in range(X_train.shape[0]):
sess.run([cost,train],feed_dict={xs:X_train.values[j,:].reshape(1,3), ys:y_train.values[j]})
# Run cost and train with each sample
c_t.append(sess.run(cost, feed_dict={xs:X_train,ys:y_train}))
c_test.append(sess.run(cost, feed_dict={xs:X_test,ys:y_test}))
print('Epoch :',i,'Cost :',c_t[i])
plt.scatter(i,c_t[i])
pred = sess.run(output, feed_dict={xs:X_test})
print(xs)
# predict output of test data after training
print('Cost :',sess.run(cost, feed_dict={xs:X_test,ys:y_test}))
# save model
saver.save(sess,'save folder')
print('Model saved')
At this point, if I output the trainable variables with tf.trainable_variables(), I get four variables of the expected shape for W_1:0, W_O:0, b_1:0, b_O:0.
I want to be able have a different file which restores the model, then uses the same weights and biases as saved, to allow me to run a variety of testing data through.
I am having trouble restoring the model and persuading it to reuse the past weights and biases. My restore code looks like this:
# clear old tensor values
tf.reset_default_graph()
newsaver = tf.train.import_meta_graph(modelloc)
def neural_net_model(X_data,input_dim):
W_1 = tf.Variable(tf.random_uniform([input_dim,hiddenneurons]))
b_1 = tf.Variable(tf.zeros([hiddenneurons]))
layer_1 = tf.add(tf.matmul(X_data,W_1), b_1)
layer_1 = tf.nn.tanh(layer_1)
W_O = tf.Variable(tf.random_uniform([hiddenneurons,1]))
b_O = tf.Variable(tf.zeros([1]))
output = tf.add(tf.matmul(layer_1,W_O), b_O)
xs = tf.placeholder("float")
ys = tf.placeholder("float")
output = neural_net_model(xs,3)
with tf.Session() as sessr:
sessr.run(tf.global_variables_initializer())
newsaver.restore(sessr, tf.train.latest_checkpoint('folder where .ckpt files are'))
pred = sessr.run(output, feed_dict={xs:X_test})
At this point, if I type tf.trainable_variables(), I get the details of the four tensors W_1:0, W_O:0, b_O:0, b_1:0, plus four new ones Variable_0, Variable_1:0, Variable_2:0, Variable_3_0. This means that the data is tested on these new variables and does not give the desired result. I don't seem to be able to use the restored weights and biases W_1,W_O, b_1,b_O.
I KNOW that I am reinitialising the variables when I don't need to, and this is the problem, and I have read this post here in detail. I have also read this, and this and many others. If I remove the repitition of the model definition, 'output', or 'neural_net_model' becomes undefined and the code doesn't work. If I try to specify W_1 etc. in any other way, the code doesn't work.

How to restore saved BiRNN model in tensorflow so that all output neurons correctly bundled to the corresponding output classes

I faced a problem with properly restoring the saved model in tensorflow. I created the Bidirectional RNN model in tensorflow with following code:
batchX_placeholder = tf.placeholder(tf.float32, [None, timesteps, 1],
name="batchX_placeholder")])
batchY_placeholder = tf.placeholder(tf.float32, [None, num_classes],
name="batchY_placeholder")
weights = tf.Variable(np.random.rand(2*STATE_SIZE, num_classes),
dtype=tf.float32, name="weights")
biases = tf.Variable(np.zeros((1, num_classes)), dtype=tf.float32,
name="biases")
logits = BiRNN(batchX_placeholder, weights, biases)
with tf.name_scope("prediction"):
prediction = tf.nn.softmax(logits)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=logits, labels=batchY_placeholder))
lr = tf.Variable(learning_rate, trainable=False, dtype=tf.float32,
name='lr')
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss_op)
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
The architecture of BiRNN created with the following function:
def BiRNN(x, weights, biases):
# Unstack to get a list of 'time_steps' tensors of shape (batch_size,
# num_input)
x = tf.unstack(x, time_steps, 1)
# Forward and Backward direction cells
lstm_fw_cell = rnn.BasicLSTMCell(STATE_SIZE, forget_bias=1.0)
lstm_bw_cell = rnn.BasicLSTMCell(STATE_SIZE, forget_bias=1.0)
outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell,
lstm_bw_cell, x, dtype=tf.float32)
# Linear activation, using rnn inner loop last output
return tf.matmul(outputs[-1], weights) + biases
Then I train a model and save it after each 200 steps:
with tf.Session() as sess:
sess.run(init_op)
current_step = 0
for batch_x, batch_y in get_minibatch():
sess.run(train_op, feed_dict={batchX_placeholder: batch_x,
batchY_placeholder: batch_y})
current_step += 1
if current_step % 200 == 0:
saver.save(sess, os.path.join(model_dir, "model")
To run the saved model in inference mode I use saved tensorflow graph in "model.meta" file:
graph = tf.get_default_graph()
saver = tf.train.import_meta_graph(os.path.join(model_dir, "model.meta"))
sess = tf.Session()
saver.restore(sess, tf.train.latest_checkpoint(model_dir)
weights = graph.get_tensor_by_name("weights:0")
biases = graph.get_tensor_by_name("biases:0")
batchX_placeholder = graph.get_tensor_by_name("batchX_placeholder:0")
batchY_placeholder = graph.get_tensor_by_name("batchY_placeholder:0")
logits = BiRNN(batchX_placeholder, weights, biases)
prediction = graph.get_operation_by_name("prediction/Softmax")
argmax_pred = tf.argmax(prediction, 1)
init = tf.global_variables_initializer()
sess.run(init)
for x_seq, y_gt in get_sequence():
_, y_pred = sess.run([prediction, argmax_pred],
feed_dict={batchX_placeholder: [x_seq]],
batchY_placeholder: [[0.0, 0.0]]})
print("Y ground true: " + str(y_gt) + ", Y pred: " + str(y_pred[0]))
And when I run the code in inference mode, I get different results each time I launch it. It seems that output neurons from the softmax layer randomly bundled with different output classes.
So, my question is: How can I save and then correctly restore the model in tensorflow, so that all neurons properly bundled with corresponding output classes?
There is no need to call tf.global_variables_initializer(), I think that is your problem.
I removed some operations: logits, weights and biases since you don't need them, all those are already loaded, use graph.get_tensor_by_name to get them.
For the prediction, get the tensor instead of the operation. (see this answer):
This is the code:
graph = tf.get_default_graph()
saver = tf.train.import_meta_graph(os.path.join(model_dir, "model.meta"))
sess = tf.Session()
saver.restore(sess, tf.train.latest_checkpoint(model_dir))
batchX_placeholder = graph.get_tensor_by_name("batchX_placeholder:0")
batchY_placeholder = graph.get_tensor_by_name("batchY_placeholder:0")
prediction = graph.get_tensor_by_name("prediction/Softmax:0")
argmax_pred = tf.argmax(prediction, 1)
Edit 1: I notice that I wasn't clear on why you got different results.
And when I run the code in inference mode, I get different results
each time I launch it.
Notice that although you used the weights from the loaded model, you are creating the BiRNN again, and the BasicLSTMCell also have weights and other variables that you don't set from your loaded model, hence they need to be initialized (with new random values) resulting in an untrained model again.

how to restore the learning rate in TF from previously saved checkpoint ?

I have stopped training at some point and saved checkpoint, meta files etc.
Now when I want to resume training, I want to start with last running learning rate of the optimizer. Can you provide a example of doing so ?
For those coming here (like me) wondering whether the last learning rate is automatically restored: tf.train.exponential_decay doesn't add any Variables to the graph, it only adds the operations necessary to derive the correct current learning rate value given a certain global_step value. This way, you only need to checkpoint the global_step value (which is done by default normally) and, assuming you keep the same initial learning rate, decay steps and decay factor, you'll automatically pick up training where you left it, with the correct learning rate value.
Inspecting the checkpoint won't show any learning_rate variable (or similar), simply because there is no need for any.
This example code learns to add two numbers:
import tensorflow as tf
import numpy as np
import os
save_ckpt_dir = './add_ckpt'
ckpt_filename = 'add.ckpt'
save_ckpt_path = os.path.join(save_ckpt_dir, ckpt_filename)
if not os.path.isdir(save_ckpt_dir):
os.mkdir(save_ckpt_dir)
if [fname.startswith("add.ckpt") for fname in os.listdir(save_ckpt_dir)]: # prefer to load pre-trained net
load_ckpt_path = save_ckpt_path
else:
load_ckpt_path = None # train from scratch
def add_layer(inputs, in_size, out_size, activation_fn=None):
Weights = tf.Variable(tf.ones([in_size, out_size]), name='Weights')
biases = tf.Variable(tf.zeros([1, out_size]), name='biases')
Wx_plus_b = tf.add(tf.matmul(inputs, Weights), biases)
if activation_fn is None:
layer_output = Wx_plus_b
else:
layer_output = activation_fn(Wx_plus_b)
return layer_output
def produce_batch(batch_size=256):
"""Loads a single batch of data.
Args:
batch_size: The number of excersises in the batch.
Returns:
x : column vector of numbers
y : another column of numbers
xy_sum : the sum of the columns
"""
x = np.random.random(size=[batch_size, 1]) * 10
y = np.random.random(size=[batch_size, 1]) * 10
xy_sum = x + y
return x, y, xy_sum
with tf.name_scope("inputs"):
xs = tf.placeholder(tf.float32, [None, 1])
ys = tf.placeholder(tf.float32, [None, 1])
with tf.name_scope("correct_labels"):
xysums = tf.placeholder(tf.float32, [None, 1])
with tf.name_scope("step_and_learning_rate"):
global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96) # start lr=0.15, decay every 10 steps with a base of 0.96
with tf.name_scope("graph_body"):
prediction = add_layer(tf.concat([xs, ys], 1), 2, 1, activation_fn=None)
with tf.name_scope("loss_and_train"):
# the error between prediction and real data
loss = tf.reduce_mean(tf.reduce_sum(tf.square(xysums-prediction), reduction_indices=[1]))
# Passing global_step to minimize() will increment it at each step.
train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)
with tf.name_scope("init_load_save"):
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(init)
if load_ckpt_path:
saver.restore(sess, load_ckpt_path)
for i in range(1000):
x, y, xy_sum = produce_batch(256)
_, global_step_np, loss_np, lr_np = sess.run([train_step, global_step, loss, lr], feed_dict={xs: x, ys: y, xysums: xy_sum})
if global_step_np % 100 == 0:
print("global step: {}, loss: {}, learning rate: {}".format(global_step_np, loss_np, lr_np))
saver.save(sess, save_ckpt_path)
if you run it a few times, you will see the learning rate decrease. It also saves the global step. The trick is here:
with tf.name_scope("step_and_learning_rate"):
global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96) # start lr=0.15, decay every 10 steps with a base of 0.96
...
train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)
By default, saver.save will save all savable objects (including learning rate and global step). However, if tf.train.Saver is provided with var_list, saver.save will only save the vars included in var_list:
saver = tf.train.Saver(var_list = ..list of vars to save..)
sources:
https://www.tensorflow.org/api_docs/python/tf/train/exponential_decay
https://stats.stackexchange.com/questions/200063/tensorflow-adam-optimizer-with-exponential-decay
https://www.tensorflow.org/api_docs/python/tf/train/Saver (see "saveable objects")

restore a model trained with variable input length in tensorflow results in InvalidArgumentError

I am rather new to tensorflow and am currently experimenting with models of varying complexity. I have a problem with the save and restore functionality of the package. As far as I did understand the tutorials, I should be able to restore a trained graph and run it with some new input at some later point. However, I get the following error when I try to do just that.:
InvalidArgumentError (see above for traceback): Shape [-1,10] has negative dimensions
[[Node: Placeholder = Placeholderdtype=DT_FLOAT, shape=[?,10], _device="/job:localhost/replica:0/task:0/cpu:0"]]
My understanding of the message is that the restored graph does not like one dimension to be left arbitrary, which in turn is necessary for practical cases where I don't know beforehand how large my input will be. A code snippet as a minimal example, producing the error above, can be found below. I know how to restore each tensor individually but this gets impractical pretty quickly when the models grow in complexity. I am thankful for any help I get and apologize in case my question is stupid.
import numpy as np
import tensorflow as tf
def generate_random_input():
alist = []
for _ in range(10):
alist.append(np.random.uniform(-1, 1, 100))
return np.array(alist).T
def generate_random_target():
return np.random.uniform(-1, 1, 100)
x = tf.placeholder('float', [None, 10])
y = tf.placeholder('float')
# the model
w1 = tf.get_variable('w1', [10, 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(seed=1))
b1 = tf.get_variable('b1', [1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(seed=1))
result = tf.add(tf.matmul(x, w1), b1, name='result')
loss = tf.reduce_mean(tf.losses.mean_squared_error(predictions=result, labels=y))
optimizer = tf.train.AdamOptimizer(0.03).minimize(loss)
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run([optimizer, loss], feed_dict={x: generate_random_input(), y: generate_random_target()})
saver.save(sess, 'file_name')
# now load the model in another session:
sess2 = tf.Session()
saver = tf.train.import_meta_graph('file_name.meta')
saver.restore(sess2, tf.train.latest_checkpoint('./'))
graph = tf.get_default_graph()
pred = graph.get_operation_by_name('result')
test_result = sess2.run(pred, feed_dict={x: generate_random_input()})
in the last line, you don't feed_dict the label_palceholder with the data. So in the placeholder, the [-1] dimension is still -1, other than the batch size. That's the cause.
I'm having the exact same problem as you. I'm importing and testing a bunch of different CNNs with different layer sizes and testing on various datasets. You can stick your model creation in a function like so and recreate it in your other code:
def create_model():
x = tf.placeholder('float', [None, 10])
y = tf.placeholder('float')
w1 = tf.get_variable('w1', [10, 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(seed=1))
b1 = tf.get_variable('b1', [1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer(seed=1))
result = tf.add(tf.matmul(x, w1), b1, name='result')
return x, y, result
x, y, result = create_model()
loss = tf.reduce_mean(tf.losses.mean_squared_error(predictions=result, labels=y))
optimizer = tf.train.AdamOptimizer(0.03).minimize(loss)
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run([optimizer, loss], feed_dict={x: generate_random_input(), y: generate_random_target()})
saver.save(sess, 'file_name')
# now load the model in another session:
sess2 = tf.Session()
# This stuff is optional if everything is the same scope
x, y, result = create_model()
saver = tf.train.Saver()
# loss = ... if you want loss
# Now just restore the weights and run
saver.restore(sess, 'file_name')
test_result = sess2.run(pred, feed_dict={x: generate_random_input()})
This is a bit tedious if I want to import many complex architectures with different dimensions. For our situation, I don't know if there's any other way to restore an entire model than to recreate that architecture first in your second session.

Add a summary of accuracy of the whole train/test dataset in Tensorflow

I am trying to use Tensorboard to visualize my training procedure. My purpose is, when every epoch completed, I would like to test the network's accuracy using the whole validation dataset, and store this accuracy result into a summary file, so that I can visualize it in Tensorboard.
I know Tensorflow has summary_op to do it, however it seems only work for one batch when running the code sess.run(summary_op). I need to calculate the accuracy for the whole dataset. How?
Is there any example to do it?
Define a tf.scalar_summary that accepts a placeholder:
accuracy_value_ = tf.placeholder(tf.float32, shape=())
accuracy_summary = tf.scalar_summary('accuracy', accuracy_value_)
Then calculate the accuracy for the whole dataset (define a routine that calculates the accuracy for every batch in the dataset and extract the mean value) and save it into a python variable, let's call it va.
Once you have the value of va, just run the accuracy_summary op, feeding the accuracy_value_ placeholder:
sess.run(accuracy_summary, feed_dict={accuracy_value_: va})
I implement a naive one-layer model as an example to classify MNIST dataset and visualize validation accuracy in Tensorboard, it works for me.
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
import os
# number of epoch
num_epoch = 1000
model_dir = '/tmp/tf/onelayer_model/accu_info'
# mnist dataset location, change if you need
data_dir = '../data/mnist'
# load MNIST dataset without one hot
dataset = read_data_sets(data_dir, one_hot=False)
# Create placeholder for input images X and labels y
X = tf.placeholder(tf.float32, [None, 784])
# one_hot = False
y = tf.placeholder(tf.int32)
# One layer model graph
W = tf.Variable(tf.truncated_normal([784, 10], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=[10]))
logits = tf.nn.relu(tf.matmul(X, W) + b)
init = tf.initialize_all_variables()
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)
# loss function
loss = tf.reduce_mean(cross_entropy)
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
_, top_1_op = tf.nn.top_k(logits)
top_1 = tf.reshape(top_1_op, shape=[-1])
correct_classification = tf.cast(tf.equal(top_1, y), tf.float32)
# accuracy function
acc = tf.reduce_mean(correct_classification)
# define info that is used in SummaryWritter
acc_summary = tf.scalar_summary('valid_accuracy', acc)
valid_summary_op = tf.merge_summary([acc_summary])
with tf.Session() as sess:
# initialize all the variable
sess.run(init)
print("Writing Summaries to %s" % model_dir)
train_summary_writer = tf.train.SummaryWriter(model_dir, sess.graph)
# load validation dataset
valid_x = dataset.validation.images
valid_y = dataset.validation.labels
for epoch in xrange(num_epoch):
batch_x, batch_y = dataset.train.next_batch(100)
feed_dict = {X: batch_x, y: batch_y}
_, acc_value, loss_value = sess.run(
[train_op, acc, loss], feed_dict=feed_dict)
vsummary = sess.run(valid_summary_op,
feed_dict={X: valid_x,
y: valid_y})
# Write validation accuracy summary
train_summary_writer.add_summary(vsummary, epoch)
Using batching with your validation set is possible in case you are using tf.metrics ops, which use internal counters. Here is a simplified example:
model = create_model()
tf.summary.scalar('cost', model.cost_op)
acc_value_op, acc_update_op = tf.metrics.accuracy(labels,predictions)
summary_common = tf.summary.merge_all()
summary_valid = tf.summary.merge([
tf.summary.scalar('accuracy', acc_value_op),
# other metrics here...
])
with tf.Session() as sess:
train_writer = tf.summary.FileWriter(logs_path + '/train',
sess.graph)
valid_writer = tf.summary.FileWriter(logs_path + '/valid')
While training, only write the common summary using your train-writer:
summary = sess.run(summary_common)
train_writer.add_summary(summary, tf.train.global_step(sess, gstep_op))
train_writer.flush()
After every validation, write both summaries using the valid-writer:
gstep, summaryc, summaryv = sess.run([gstep_op, summary_common, summary_valid])
valid_writer.add_summary(summaryc, gstep)
valid_writer.add_summary(summaryv, gstep)
valid_writer.flush()
When using tf.metrics, don't forget to reset the internal counters (local variables) before every validation step.