I am using Tensorflow and Python to define and train a NN in the following manner:
# clear old tensor values that cause a problem when rerunning
tf.reset_default_graph()
#network dims
hiddenneurons=12
input_dim=3
# Set up the model
def neural_net_model(X_data,input_dim):
W_1 = tf.Variable(tf.random_uniform([input_dim,hiddenneurons]),name='W_1')
b_1 = tf.Variable(tf.zeros([hiddenneurons]),name='b_1')
layer_1 = tf.add(tf.matmul(X_data,W_1), b_1)
layer_1 = tf.nn.tanh(layer_1)
# layer 1 multiplying and adding bias then activation function
W_O = tf.Variable(tf.random_uniform([hiddenneurons,1]),name='W_O')
b_O = tf.Variable(tf.zeros([1]),name='b_O')
output = tf.add(tf.matmul(layer_1,W_O), b_O)
# O/p layer multiplying and adding bias then activation function
return output
xs = tf.placeholder("float")
ys = tf.placeholder("float")
output = neural_net_model(xs,3)
cost = tf.reduce_mean(tf.square(output-ys))
train = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
c_t = []
c_test = []
with tf.Session() as sess:
# Initiate session and initialize all vaiables
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
for i in range(10):
for j in range(X_train.shape[0]):
sess.run([cost,train],feed_dict={xs:X_train.values[j,:].reshape(1,3), ys:y_train.values[j]})
# Run cost and train with each sample
c_t.append(sess.run(cost, feed_dict={xs:X_train,ys:y_train}))
c_test.append(sess.run(cost, feed_dict={xs:X_test,ys:y_test}))
print('Epoch :',i,'Cost :',c_t[i])
plt.scatter(i,c_t[i])
pred = sess.run(output, feed_dict={xs:X_test})
print(xs)
# predict output of test data after training
print('Cost :',sess.run(cost, feed_dict={xs:X_test,ys:y_test}))
# save model
saver.save(sess,'save folder')
print('Model saved')
At this point, if I output the trainable variables with tf.trainable_variables(), I get four variables of the expected shape for W_1:0, W_O:0, b_1:0, b_O:0.
I want to be able have a different file which restores the model, then uses the same weights and biases as saved, to allow me to run a variety of testing data through.
I am having trouble restoring the model and persuading it to reuse the past weights and biases. My restore code looks like this:
# clear old tensor values
tf.reset_default_graph()
newsaver = tf.train.import_meta_graph(modelloc)
def neural_net_model(X_data,input_dim):
W_1 = tf.Variable(tf.random_uniform([input_dim,hiddenneurons]))
b_1 = tf.Variable(tf.zeros([hiddenneurons]))
layer_1 = tf.add(tf.matmul(X_data,W_1), b_1)
layer_1 = tf.nn.tanh(layer_1)
W_O = tf.Variable(tf.random_uniform([hiddenneurons,1]))
b_O = tf.Variable(tf.zeros([1]))
output = tf.add(tf.matmul(layer_1,W_O), b_O)
xs = tf.placeholder("float")
ys = tf.placeholder("float")
output = neural_net_model(xs,3)
with tf.Session() as sessr:
sessr.run(tf.global_variables_initializer())
newsaver.restore(sessr, tf.train.latest_checkpoint('folder where .ckpt files are'))
pred = sessr.run(output, feed_dict={xs:X_test})
At this point, if I type tf.trainable_variables(), I get the details of the four tensors W_1:0, W_O:0, b_O:0, b_1:0, plus four new ones Variable_0, Variable_1:0, Variable_2:0, Variable_3_0. This means that the data is tested on these new variables and does not give the desired result. I don't seem to be able to use the restored weights and biases W_1,W_O, b_1,b_O.
I KNOW that I am reinitialising the variables when I don't need to, and this is the problem, and I have read this post here in detail. I have also read this, and this and many others. If I remove the repitition of the model definition, 'output', or 'neural_net_model' becomes undefined and the code doesn't work. If I try to specify W_1 etc. in any other way, the code doesn't work.
Related
I am new to tensorflow and I am trying to build an image classifier. I have successfully created the model and I am trying to predict a single image after restoring the model. I have gone through various tutorials (https://github.com/sankit1/cv-tricks.com/blob/master/Tensorflow-tutorials/tutorial-2-image-classifier/predict.py) but I can't figure out the feed-dict thing in my code. I am stuck at predict fnction after loading the saved model. Can someone please help me and tell me what to do after loading all the variables from the saved model?
This is the train function which returns the parameters and save them in a model.
def trainModel(train, test, learning_rate=0.0001, num_epochs=2, minibatch_size=32, graph_filename='costs'):
"""
Implements a three-layer tensorflow neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SOFTMAX.
Input:
train : training set
test : test set
learning_rate : learning rate
num_epochs : number of epochs
minibatch_size : size of minibatch
print_cost : True to print the cost every epoch
Returns:
parameters : parameters learnt by the model
"""
ops.reset_default_graph() #for rerunning the model without resetting tf vars
# input and output shapes
(n_x, m) = train.images.T.shape
n_y = train.labels.T.shape[0]
costs = [] #var for storing the costs for later use
# create placeholders
X, Y = placeholderCreator(n_x, n_y)
parameters = paramInitializer()
# Forward propagation
Z3 = forwardPropagation(X, parameters)
# Cost function
cost = costCalc(Z3, Y)
#Backpropagation using adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Initialize tf variables
init = tf.global_variables_initializer()
minibatch_size = 32
# Start session to compute Tensorflow graph
with tf.Session() as sess:
# Run initialization
sess.run(init)
for epoch in range(num_epochs): # Training loop
epoch_cost = 0.
num_minibatches = int(m / minibatch_size)
for i in range(num_minibatches):
minibatch_X, minibatch_Y = train.next_batch(minibatch_size) # Get next batch of training data and labels
_, minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X.T, Y: minibatch_Y.T}) # Execute optimizer and cost function
epoch_cost += minibatch_cost / num_minibatches # Update epoch cost
saver = tf.train.Saver()
# Save parameters
parameters = sess.run(parameters)
saver.save(sess, "~/trained-model.ckpt")
return parameters
And this is my predict function where I am trying to predict an image. I have converted that image into MNIST format for ease of use (predicting_data). I load the model that I saved, use a softmax function on the output of 3rd layer (final output).
def predict():
train = predicting_data.train
(n_x, m) = train.images.T.shape
n_y = train.labels.T.shape[0]
X, Y = placeholderCreator(n_x, n_y)
with tf.Session() as sess:
new_saver = tf.train.import_meta_graph('~/trained-model.ckpt.meta')
new_saver.restore(sess, '~/trained-model.ckpt')
W1 = tf.get_default_graph().get_tensor_by_name('W1:0')
b1 = tf.get_default_graph().get_tensor_by_name('b1:0')
W2 = tf.get_default_graph().get_tensor_by_name('W2:0')
b2 = tf.get_default_graph().get_tensor_by_name('b2:0')
W3 = tf.get_default_graph().get_tensor_by_name('W3:0')
b3 = tf.get_default_graph().get_tensor_by_name('b3:0')
# forward propagation
Z1 = tf.add(tf.matmul(W1,X), b1)
A1 = tf.nn.relu(Z1)
Z2 = tf.add(tf.matmul(W2,A1), b2)
A2 = tf.nn.relu(Z2)
Z3 = tf.add(tf.matmul(W3,A2), b3)
y_pred = tf.nn.softmax(Z3) ####what to do after this????
cost = sess.run(y_pred, feed_dict={X: train.images.T})
Thank you in advance!
As vijay says in his comment:
Your predict part is not right, you need to get the input and predict tensors from the saved graph using the get_tensor_by_name() function and then use it in your sess.run
If you look at this post, it covers a similar problem and has some code examples.
In your code, you can pass 1 to the next_batch method and get just one image.
minibatch_X, minibatch_Y = train.next_batch(1)
I am running distributed an mnist model in distributed TensorFlow. I would like to monitor "manually" the evolution of the global_step for debugging purposes. What is the best and clean way to get the global step in a distributed TensorFlow setting?
My code below
...
with tf.device(device):
images = tf.placeholder(tf.float32, [None, 784], name='image_input')
labels = tf.placeholder(tf.float32, [None], name='label_input')
data = read_data_sets(FLAGS.data_dir,
one_hot=False,
fake_data=False)
logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
loss = mnist.loss(logits, labels)
loss = tf.Print(loss, [loss], message="Loss = ")
train_op = mnist.training(loss, FLAGS.learning_rate)
hooks=[tf.train.StopAtStepHook(last_step=FLAGS.nb_steps)]
with tf.train.MonitoredTrainingSession(
master=target,
is_chief=(FLAGS.task_index == 0),
checkpoint_dir=FLAGS.log_dir,
hooks = hooks) as sess:
while not sess.should_stop():
xs, ys = data.train.next_batch(FLAGS.batch_size, fake_data=False)
sess.run([train_op], feed_dict={images:xs, labels:ys})
global_step_value = # ... what is the clean way to get this variable
Normally a good practice is to initialize your global step variable in your graph-defining process, e.g. global_step = tf.Variable(0, trainable=False, name='global_step'). Then you can use graph.get_tensor_by_name("global_step:0") to get your global step easily.
I faced a problem with properly restoring the saved model in tensorflow. I created the Bidirectional RNN model in tensorflow with following code:
batchX_placeholder = tf.placeholder(tf.float32, [None, timesteps, 1],
name="batchX_placeholder")])
batchY_placeholder = tf.placeholder(tf.float32, [None, num_classes],
name="batchY_placeholder")
weights = tf.Variable(np.random.rand(2*STATE_SIZE, num_classes),
dtype=tf.float32, name="weights")
biases = tf.Variable(np.zeros((1, num_classes)), dtype=tf.float32,
name="biases")
logits = BiRNN(batchX_placeholder, weights, biases)
with tf.name_scope("prediction"):
prediction = tf.nn.softmax(logits)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
logits=logits, labels=batchY_placeholder))
lr = tf.Variable(learning_rate, trainable=False, dtype=tf.float32,
name='lr')
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss_op)
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
The architecture of BiRNN created with the following function:
def BiRNN(x, weights, biases):
# Unstack to get a list of 'time_steps' tensors of shape (batch_size,
# num_input)
x = tf.unstack(x, time_steps, 1)
# Forward and Backward direction cells
lstm_fw_cell = rnn.BasicLSTMCell(STATE_SIZE, forget_bias=1.0)
lstm_bw_cell = rnn.BasicLSTMCell(STATE_SIZE, forget_bias=1.0)
outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell,
lstm_bw_cell, x, dtype=tf.float32)
# Linear activation, using rnn inner loop last output
return tf.matmul(outputs[-1], weights) + biases
Then I train a model and save it after each 200 steps:
with tf.Session() as sess:
sess.run(init_op)
current_step = 0
for batch_x, batch_y in get_minibatch():
sess.run(train_op, feed_dict={batchX_placeholder: batch_x,
batchY_placeholder: batch_y})
current_step += 1
if current_step % 200 == 0:
saver.save(sess, os.path.join(model_dir, "model")
To run the saved model in inference mode I use saved tensorflow graph in "model.meta" file:
graph = tf.get_default_graph()
saver = tf.train.import_meta_graph(os.path.join(model_dir, "model.meta"))
sess = tf.Session()
saver.restore(sess, tf.train.latest_checkpoint(model_dir)
weights = graph.get_tensor_by_name("weights:0")
biases = graph.get_tensor_by_name("biases:0")
batchX_placeholder = graph.get_tensor_by_name("batchX_placeholder:0")
batchY_placeholder = graph.get_tensor_by_name("batchY_placeholder:0")
logits = BiRNN(batchX_placeholder, weights, biases)
prediction = graph.get_operation_by_name("prediction/Softmax")
argmax_pred = tf.argmax(prediction, 1)
init = tf.global_variables_initializer()
sess.run(init)
for x_seq, y_gt in get_sequence():
_, y_pred = sess.run([prediction, argmax_pred],
feed_dict={batchX_placeholder: [x_seq]],
batchY_placeholder: [[0.0, 0.0]]})
print("Y ground true: " + str(y_gt) + ", Y pred: " + str(y_pred[0]))
And when I run the code in inference mode, I get different results each time I launch it. It seems that output neurons from the softmax layer randomly bundled with different output classes.
So, my question is: How can I save and then correctly restore the model in tensorflow, so that all neurons properly bundled with corresponding output classes?
There is no need to call tf.global_variables_initializer(), I think that is your problem.
I removed some operations: logits, weights and biases since you don't need them, all those are already loaded, use graph.get_tensor_by_name to get them.
For the prediction, get the tensor instead of the operation. (see this answer):
This is the code:
graph = tf.get_default_graph()
saver = tf.train.import_meta_graph(os.path.join(model_dir, "model.meta"))
sess = tf.Session()
saver.restore(sess, tf.train.latest_checkpoint(model_dir))
batchX_placeholder = graph.get_tensor_by_name("batchX_placeholder:0")
batchY_placeholder = graph.get_tensor_by_name("batchY_placeholder:0")
prediction = graph.get_tensor_by_name("prediction/Softmax:0")
argmax_pred = tf.argmax(prediction, 1)
Edit 1: I notice that I wasn't clear on why you got different results.
And when I run the code in inference mode, I get different results
each time I launch it.
Notice that although you used the weights from the loaded model, you are creating the BiRNN again, and the BasicLSTMCell also have weights and other variables that you don't set from your loaded model, hence they need to be initialized (with new random values) resulting in an untrained model again.
I have stopped training at some point and saved checkpoint, meta files etc.
Now when I want to resume training, I want to start with last running learning rate of the optimizer. Can you provide a example of doing so ?
For those coming here (like me) wondering whether the last learning rate is automatically restored: tf.train.exponential_decay doesn't add any Variables to the graph, it only adds the operations necessary to derive the correct current learning rate value given a certain global_step value. This way, you only need to checkpoint the global_step value (which is done by default normally) and, assuming you keep the same initial learning rate, decay steps and decay factor, you'll automatically pick up training where you left it, with the correct learning rate value.
Inspecting the checkpoint won't show any learning_rate variable (or similar), simply because there is no need for any.
This example code learns to add two numbers:
import tensorflow as tf
import numpy as np
import os
save_ckpt_dir = './add_ckpt'
ckpt_filename = 'add.ckpt'
save_ckpt_path = os.path.join(save_ckpt_dir, ckpt_filename)
if not os.path.isdir(save_ckpt_dir):
os.mkdir(save_ckpt_dir)
if [fname.startswith("add.ckpt") for fname in os.listdir(save_ckpt_dir)]: # prefer to load pre-trained net
load_ckpt_path = save_ckpt_path
else:
load_ckpt_path = None # train from scratch
def add_layer(inputs, in_size, out_size, activation_fn=None):
Weights = tf.Variable(tf.ones([in_size, out_size]), name='Weights')
biases = tf.Variable(tf.zeros([1, out_size]), name='biases')
Wx_plus_b = tf.add(tf.matmul(inputs, Weights), biases)
if activation_fn is None:
layer_output = Wx_plus_b
else:
layer_output = activation_fn(Wx_plus_b)
return layer_output
def produce_batch(batch_size=256):
"""Loads a single batch of data.
Args:
batch_size: The number of excersises in the batch.
Returns:
x : column vector of numbers
y : another column of numbers
xy_sum : the sum of the columns
"""
x = np.random.random(size=[batch_size, 1]) * 10
y = np.random.random(size=[batch_size, 1]) * 10
xy_sum = x + y
return x, y, xy_sum
with tf.name_scope("inputs"):
xs = tf.placeholder(tf.float32, [None, 1])
ys = tf.placeholder(tf.float32, [None, 1])
with tf.name_scope("correct_labels"):
xysums = tf.placeholder(tf.float32, [None, 1])
with tf.name_scope("step_and_learning_rate"):
global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96) # start lr=0.15, decay every 10 steps with a base of 0.96
with tf.name_scope("graph_body"):
prediction = add_layer(tf.concat([xs, ys], 1), 2, 1, activation_fn=None)
with tf.name_scope("loss_and_train"):
# the error between prediction and real data
loss = tf.reduce_mean(tf.reduce_sum(tf.square(xysums-prediction), reduction_indices=[1]))
# Passing global_step to minimize() will increment it at each step.
train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)
with tf.name_scope("init_load_save"):
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(init)
if load_ckpt_path:
saver.restore(sess, load_ckpt_path)
for i in range(1000):
x, y, xy_sum = produce_batch(256)
_, global_step_np, loss_np, lr_np = sess.run([train_step, global_step, loss, lr], feed_dict={xs: x, ys: y, xysums: xy_sum})
if global_step_np % 100 == 0:
print("global step: {}, loss: {}, learning rate: {}".format(global_step_np, loss_np, lr_np))
saver.save(sess, save_ckpt_path)
if you run it a few times, you will see the learning rate decrease. It also saves the global step. The trick is here:
with tf.name_scope("step_and_learning_rate"):
global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96) # start lr=0.15, decay every 10 steps with a base of 0.96
...
train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)
By default, saver.save will save all savable objects (including learning rate and global step). However, if tf.train.Saver is provided with var_list, saver.save will only save the vars included in var_list:
saver = tf.train.Saver(var_list = ..list of vars to save..)
sources:
https://www.tensorflow.org/api_docs/python/tf/train/exponential_decay
https://stats.stackexchange.com/questions/200063/tensorflow-adam-optimizer-with-exponential-decay
https://www.tensorflow.org/api_docs/python/tf/train/Saver (see "saveable objects")
I am trying to use Tensorboard to visualize my training procedure. My purpose is, when every epoch completed, I would like to test the network's accuracy using the whole validation dataset, and store this accuracy result into a summary file, so that I can visualize it in Tensorboard.
I know Tensorflow has summary_op to do it, however it seems only work for one batch when running the code sess.run(summary_op). I need to calculate the accuracy for the whole dataset. How?
Is there any example to do it?
Define a tf.scalar_summary that accepts a placeholder:
accuracy_value_ = tf.placeholder(tf.float32, shape=())
accuracy_summary = tf.scalar_summary('accuracy', accuracy_value_)
Then calculate the accuracy for the whole dataset (define a routine that calculates the accuracy for every batch in the dataset and extract the mean value) and save it into a python variable, let's call it va.
Once you have the value of va, just run the accuracy_summary op, feeding the accuracy_value_ placeholder:
sess.run(accuracy_summary, feed_dict={accuracy_value_: va})
I implement a naive one-layer model as an example to classify MNIST dataset and visualize validation accuracy in Tensorboard, it works for me.
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
import os
# number of epoch
num_epoch = 1000
model_dir = '/tmp/tf/onelayer_model/accu_info'
# mnist dataset location, change if you need
data_dir = '../data/mnist'
# load MNIST dataset without one hot
dataset = read_data_sets(data_dir, one_hot=False)
# Create placeholder for input images X and labels y
X = tf.placeholder(tf.float32, [None, 784])
# one_hot = False
y = tf.placeholder(tf.int32)
# One layer model graph
W = tf.Variable(tf.truncated_normal([784, 10], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=[10]))
logits = tf.nn.relu(tf.matmul(X, W) + b)
init = tf.initialize_all_variables()
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)
# loss function
loss = tf.reduce_mean(cross_entropy)
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
_, top_1_op = tf.nn.top_k(logits)
top_1 = tf.reshape(top_1_op, shape=[-1])
correct_classification = tf.cast(tf.equal(top_1, y), tf.float32)
# accuracy function
acc = tf.reduce_mean(correct_classification)
# define info that is used in SummaryWritter
acc_summary = tf.scalar_summary('valid_accuracy', acc)
valid_summary_op = tf.merge_summary([acc_summary])
with tf.Session() as sess:
# initialize all the variable
sess.run(init)
print("Writing Summaries to %s" % model_dir)
train_summary_writer = tf.train.SummaryWriter(model_dir, sess.graph)
# load validation dataset
valid_x = dataset.validation.images
valid_y = dataset.validation.labels
for epoch in xrange(num_epoch):
batch_x, batch_y = dataset.train.next_batch(100)
feed_dict = {X: batch_x, y: batch_y}
_, acc_value, loss_value = sess.run(
[train_op, acc, loss], feed_dict=feed_dict)
vsummary = sess.run(valid_summary_op,
feed_dict={X: valid_x,
y: valid_y})
# Write validation accuracy summary
train_summary_writer.add_summary(vsummary, epoch)
Using batching with your validation set is possible in case you are using tf.metrics ops, which use internal counters. Here is a simplified example:
model = create_model()
tf.summary.scalar('cost', model.cost_op)
acc_value_op, acc_update_op = tf.metrics.accuracy(labels,predictions)
summary_common = tf.summary.merge_all()
summary_valid = tf.summary.merge([
tf.summary.scalar('accuracy', acc_value_op),
# other metrics here...
])
with tf.Session() as sess:
train_writer = tf.summary.FileWriter(logs_path + '/train',
sess.graph)
valid_writer = tf.summary.FileWriter(logs_path + '/valid')
While training, only write the common summary using your train-writer:
summary = sess.run(summary_common)
train_writer.add_summary(summary, tf.train.global_step(sess, gstep_op))
train_writer.flush()
After every validation, write both summaries using the valid-writer:
gstep, summaryc, summaryv = sess.run([gstep_op, summary_common, summary_valid])
valid_writer.add_summary(summaryc, gstep)
valid_writer.add_summary(summaryv, gstep)
valid_writer.flush()
When using tf.metrics, don't forget to reset the internal counters (local variables) before every validation step.