tensorflow error: restore checkpoint file

I built my own convolutional neural network, in which I track the moving averages of all trainable variables (TensorFlow 1.0):
variable_averages = tf.train.ExponentialMovingAverage(
    0.9999, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
train_op = tf.group(apply_gradient_op, variables_averages_op)

saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
summary_op = tf.summary.merge(summaries)
init = tf.global_variables_initializer()
sess = tf.Session(config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False))
sess.run(init)

# start queue runners
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

# training loop
start_time = time.time()
for step in range(FLAGS.max_steps):
    _, loss_value = sess.run([train_op, loss])
    duration = time.time() - start_time
    start_time = time.time()

    assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

    if step % 1 == 0:
        # print current model status
        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = duration / FLAGS.num_gpus
        format_str = '{} step {}, loss {}, {} examples/sec, {} sec/batch'
        print(format_str.format(datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))

    if step % 50 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

    if step % 10 == 0 or step == FLAGS.max_steps:
        print('save checkpoint')
        # save checkpoint file
        checkpoint_file = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_file, global_step=step)
This works fine and checkpoint files are saved (saver version V2). Then I try to restore the checkpoints in another script for evaluating the model. There I have this piece of code:
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(
    MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
Here I get the error "NotFoundError (see above for traceback): Key conv1/Variable/ExponentialMovingAverage not found in checkpoint", where conv1/variable/ is a variable scope.
This error occurs even before I try to restore the variables. Can you please help me solve it?
Thanks in advance
TheJude

I solved it in this way:
Call tf.reset_default_graph() before creating the second ExponentialMovingAverage(...) in the evaluation graph.
# reset the graph before creating a new EMA
tf.reset_default_graph()
# Restore the moving average version of the learned variables for eval.
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
It took me 2 hours...
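If you run into a similar NotFoundError, it can also help to list what is actually stored in the checkpoint before building the restore ops. A minimal sketch, assuming TensorFlow 1.4+ (where tf.train.list_variables is available; older releases expose the same helper under tf.contrib.framework) and a placeholder checkpoint prefix:
import tensorflow as tf

# placeholder prefix -- use whatever path saver.save() returned in your training script
ckpt_prefix = '/tmp/train/model.ckpt-100'

# print every (name, shape) pair stored in the checkpoint, so you can verify
# whether the .../ExponentialMovingAverage keys were actually written
for name, shape in tf.train.list_variables(ckpt_prefix):
    print(name, shape)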

Related

How to load the last checkpoint in TensorFlow?

I am practising with TensorFlow on this tutorial. The evaluation function depends on the training having run, so that it can load the latest checkpoint:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint)

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

    loss_plot.append(total_loss / num_steps)

    ckpt_manager.save()
Without ckpt_manager.save(), the evaluation function does not work.
When we have already trained a model and the checkpoints are available in checkpoint_path, how should we load the model without training again?
You can use tf.train.latest_checkpoint to get the latest checkpoint file and then load it manually using ckpt.restore:
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)

ckpt_path = tf.train.latest_checkpoint(checkpoint_path)
ckpt.restore(ckpt_path)
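If you want to verify that the restore actually matched the objects you built, ckpt.restore returns a status object you can assert on. A small sketch using the ckpt and ckpt_path from above (assert_existing_objects_matched assumes the encoder/decoder variables have already been created, e.g. after one forward pass):
status = ckpt.restore(ckpt_path)

# raise if saved values for encoder/decoder/optimizer were not matched
status.assert_existing_objects_matched()

# or, when deliberately restoring only part of the objects (e.g. no optimizer
# state at evaluation time), silence the warnings instead:
# status.expect_partial()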

loading a trained model back into tensorflow

I have constructed a model in TensorFlow and trained it. Now I want to work with the outputs, so I want to load the checkpoint, meta, and all the other files back into TensorFlow.
I have used the following code to train the model:
# Logging
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train')
test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test')
validate_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/validate')
writer = tf.summary.FileWriter(FLAGS.summary_dir, sess.graph)
saver = tf.train.Saver()  # for storing the best network

# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)

# Best validation accuracy seen so far
bestValidation = -0.1

# Training loop
coord = tf.train.Coordinator()  # coordinator for threads
threads = tf.train.start_queue_runners(coord=coord, sess=sess)  # start queue thread

# Training loop
for i in range(FLAGS.maxIter):
    xTrain, yTrain = sess.run(data_batch)
    sess.run(train_step, feed_dict={x_data: xTrain, y_target: np.transpose([yTrain])})
    summary = sess.run(merged, feed_dict={x_data: xTrain, y_target: np.transpose([yTrain])})
    train_writer.add_summary(summary, i)

    if (i + 1) % 10 == 0:
        print("Iteration:", i + 1, "/", FLAGS.maxIter)
        summary = sess.run(merged, feed_dict={x_data: dataTest.data, y_target: np.transpose([dataTest.target])})
        test_writer.add_summary(summary, i)

        currentValidation, summary = sess.run([accuracy, merged],
                                              feed_dict={x_data: dataTest.data,
                                                         y_target: np.transpose([dataTest.target])})
        validate_writer.add_summary(summary, i)

        if currentValidation > bestValidation and currentValidation <= 0.9:
            bestValidation = currentValidation
            saver.save(sess=sess, save_path=FLAGS.summary_dir + '/bestNetwork')
            print("\tbetter network stored,", currentValidation, ">", bestValidation)

coord.request_stop()  # ask threads to stop
coord.join(threads)   # wait for threads to stop
Now I want to load the model back into TensorFlow. I want to be able to do a few things:
Work with the outputs I've already created for the training and test datasets.
Load new data into the model and use the same weights to make new outputs.
I've tried using the following code to load the model back into TensorFlow, but it doesn't work:
with tf.Session() as sess:
    saver = tf.train.import_meta_graph(FLAGS.summary_dir + '/bestNetwork.meta')
    saver.restore(sess, tf.train.latest_checkpoint(FLAGS.summary_dir + '/checkpoint'))
I get the following error when running the code:
TypeError: expected bytes, NoneType found
As I understand it, I'm loading the meta graph from the previous section with the tf.train.import_meta_graph() function, and then I load the weights from the checkpoint. So why is this not working?
You saved the model as bestNetwork. Try this instead:
saver.restore(sess, tf.train.latest_checkpoint(FLAGS.summary_dir + '/bestNetwork'))
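For reference, tf.train.latest_checkpoint expects the directory that contains the checkpoint state file, while saver.restore expects a checkpoint prefix. A minimal sketch of both options, assuming the training script above wrote its files into FLAGS.summary_dir:
with tf.Session() as sess:
    saver = tf.train.import_meta_graph(FLAGS.summary_dir + '/bestNetwork.meta')

    # option 1: look up the newest prefix recorded in the 'checkpoint' state file
    latest = tf.train.latest_checkpoint(FLAGS.summary_dir)
    if latest is not None:
        saver.restore(sess, latest)

    # option 2: restore a specific prefix directly, without the state-file lookup
    # saver.restore(sess, FLAGS.summary_dir + '/bestNetwork')
This also explains the TypeError: latest_checkpoint returns None when it is pointed at something that is not a checkpoint directory.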

Cannot load int variable from previous session in tensorflow 1.1

I have read many similar questions and just cannot get this to work properly.
My model trains well and checkpoint files are written every epoch. I want the program to be able to continue from epoch x once reloaded, and to print which epoch it is on with every iteration. I could simply save this data outside of the checkpoint file, but I also want to do it this way to give me confidence that everything else is being stored properly.
Unfortunately, the value of the epoch/global_step variable is always still 0 when I restart.
import tensorflow as tf
import numpy as np
# more imports

def extract_number(f):  # used to get latest checkpoint file
    s = re.findall("epoch(\d+).ckpt", f)
    return (int(s[0]) if s else -1, f)

def restore(init_op, sess, saver):  # called to restore or just initialise model
    list = glob(os.path.join("./params/e*"))
    if list:
        file = max(list, key=extract_number)
        saver.restore(sess, file[:-5])
    sess.run(init_op)
    return

with tf.Graph().as_default() as g:
    # build models

    total_batch = data.train.num_examples / batch_size
    epochLimit = 51

    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:
        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        restore(init_op, sess, saver)

        epoch = global_step.eval()
        while epoch < epochLimit:
            total_batch = data.train.num_examples / batch_size

            for i in range(int(total_batch)):
                sys.stdout.flush()

                voxels = newData.eval()
                batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)

                sess.run(opt_G, feed_dict={z: batch_z, train: True})
                sess.run(opt_D, feed_dict={input: voxels, z: batch_z, train: True})

                with open("out/loss.csv", 'a') as f:
                    batch_loss_G = sess.run(loss_G, feed_dict={z: batch_z, train: False})
                    batch_loss_D = sess.run(loss_D, feed_dict={input: voxels, z: batch_z, train: False})
                    msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
                    print(msgOut)

            epoch = epoch + 1
            sess.run(global_step.assign(epoch))
            saver.save(sess, "params/epoch{0}.ckpt".format(epoch))

            batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
            voxels = sess.run(x_, feed_dict={z: batch_z})
            v = voxels[0].reshape([32, 32, 32]) > 0
            util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
I also update the global step variable using assign at the bottom. Any ideas? Any help would be greatly appreciated.
When you call sess.run(init_op) after restoring, it resets all variables to their initial values. Comment that line out and things should work.
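A minimal sketch of the restore helper with that fix applied, running the initializer only when no checkpoint is found (it keeps the glob-based lookup from the question, which is just one way of locating the newest file):
def restore(init_op, sess, saver):
    ckpt_files = glob(os.path.join("./params/e*"))
    if ckpt_files:
        newest = max(ckpt_files, key=extract_number)
        saver.restore(sess, newest[:-5])  # same suffix stripping as the original code
    else:
        # only initialise from scratch when there is nothing to restore,
        # otherwise the restored values would be overwritten
        sess.run(init_op)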
My original code was wrong for several reasons because I was trying so many things. The first responder Alexandre Passos makes a valid point, but I believe what also made the difference was the use of variable scopes (maybe?).
Below is the working updated code, in case it helps anyone:
import tensorflow as tf
import numpy as np
# more imports

def extract_number(f):  # used to get latest checkpoint file
    s = re.findall("epoch(\d+).ckpt", f)
    return (int(s[0]) if s else -1, f)

def restore(sess, saver):  # called to restore or just initialise model
    list = glob(os.path.join("./params/e*"))
    if list:
        file = max(list, key=extract_number)
        saver.restore(sess, file[:-5])
        return saver, True, sess
    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    return saver, False, sess

batch_size = 100
learning_rate = 0.0001
beta1 = 0.5
z_size = 100
save_interval = 1

data = dataset.read()
total_batch = data.train.num_examples / batch_size

def fill_queue():
    for i in range(int(total_batch * epochLimit)):
        sess.run(enqueue_op, feed_dict={batch: data.train.next_batch(batch_size)})  # running in a separate thread to feed a FIFOQueue

with tf.variable_scope("glob"):
    global_step = tf.get_variable(name='global_step', initializer=0, trainable=False)

# build models

epochLimit = 51

saver = tf.train.Saver()

with tf.Session() as sess:
    saver, rstr, sess = restore(sess, saver)

    with tf.variable_scope("glob", reuse=True):
        epocht = tf.get_variable(name='global_step', trainable=False, dtype=tf.int32)
    epoch = epocht.eval()

    while epoch < epochLimit:
        total_batch = data.train.num_examples / batch_size

        for i in range(int(total_batch)):
            sys.stdout.flush()

            voxels = newData.eval()
            batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)

            sess.run(opt_G, feed_dict={z: batch_z, train: True})
            sess.run(opt_D, feed_dict={input: voxels, z: batch_z, train: True})

            with open("out/loss.csv", 'a') as f:
                batch_loss_G = sess.run(loss_G, feed_dict={z: batch_z, train: False})
                batch_loss_D = sess.run(loss_D, feed_dict={input: voxels, z: batch_z, train: False})
                msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
                print(msgOut)

        epoch = epoch + 1
        sess.run(global_step.assign(epoch))
        saver.save(sess, "params/epoch{0}.ckpt".format(epoch))

        batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
        voxels = sess.run(x_, feed_dict={z: batch_z})
        v = voxels[0].reshape([32, 32, 32]) > 0
        util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
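For what it's worth, a simpler alternative sketch is to skip the manual scope lookup: as long as global_step is a variable in the graph and is covered by the Saver, restoring the checkpoint restores its value, and you can read it back directly. Names follow the code above, and ./params is assumed to contain the 'checkpoint' state file written by saver.save:
global_step = tf.get_variable(name='global_step', initializer=0, trainable=False)
saver = tf.train.Saver()  # includes global_step, like all other global variables

with tf.Session() as sess:
    latest = tf.train.latest_checkpoint('./params')
    if latest:
        saver.restore(sess, latest)
    else:
        sess.run(tf.global_variables_initializer())
    epoch = sess.run(global_step)  # resumes from the stored value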

Run distributed tensorflow example with error

I have three nodes running distributed TensorFlow: two workers (one with a GPU, one without) and one ps (without a GPU). The code is below:
from __future__ import print_function
import tensorflow as tf
import sys
import time

# cluster specification
parameter_servers = ["192.168.1.102:2222"]
workers = ["192.168.1.103:2223",
           "192.168.1.104:2224"]
cluster = tf.train.ClusterSpec({"ps": parameter_servers, "worker": workers})

# input flags
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS

# start a server for a specific task
server = tf.train.Server(cluster,
                         job_name=FLAGS.job_name,
                         task_index=FLAGS.task_index)

# config
batch_size = 100
learning_rate = 0.001
training_epochs = 20
logs_path = "/tmp/mnist/1"

# load mnist data set
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

if FLAGS.job_name == "ps":
    server.join()
elif FLAGS.job_name == "worker":

    # Between-graph replication
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):

        # count the number of updates
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # input images
        with tf.name_scope('input'):
            # None -> batch size can be any size, 784 -> flattened mnist image
            x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
            # target 10 output classes
            y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")

        # model parameters will change during training so we use tf.Variable
        tf.set_random_seed(1)
        with tf.name_scope("weights"):
            W1 = tf.Variable(tf.random_normal([784, 100]))
            W2 = tf.Variable(tf.random_normal([100, 10]))

        # bias
        with tf.name_scope("biases"):
            b1 = tf.Variable(tf.zeros([100]))
            b2 = tf.Variable(tf.zeros([10]))

        # implement model
        with tf.name_scope("softmax"):
            # y is our prediction
            z2 = tf.add(tf.matmul(x, W1), b1)
            a2 = tf.nn.sigmoid(z2)
            z3 = tf.add(tf.matmul(a2, W2), b2)
            y = tf.nn.softmax(z3)

        # specify cost function
        with tf.name_scope('cross_entropy'):
            # this is our cost
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

        # specify optimizer
        with tf.name_scope('train'):
            # optimizer is an "operation" which we can execute in a session
            grad_op = tf.train.GradientDescentOptimizer(learning_rate)
            train_op = grad_op.minimize(cross_entropy, global_step=global_step)

        with tf.name_scope('Accuracy'):
            # accuracy
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # create a summary for our cost and accuracy
        tf.scalar_summary("cost", cross_entropy)
        tf.scalar_summary("accuracy", accuracy)

        # merge all summaries into a single "operation" which we can execute in a session
        summary_op = tf.merge_all_summaries()
        init_op = tf.initialize_all_variables()
        print("Variables initialized ...")

    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                             global_step=global_step,
                             init_op=init_op)

    begin_time = time.time()
    frequency = 100
    with sv.prepare_or_wait_for_session(server.target) as sess:
        # create log writer object (this will log on every machine)
        writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())

        # perform training cycles
        start_time = time.time()
        for epoch in range(training_epochs):
            # number of batches in one epoch
            batch_count = int(mnist.train.num_examples / batch_size)
            count = 0
            for i in range(batch_count):
                batch_x, batch_y = mnist.train.next_batch(batch_size)

                # perform the operations we defined earlier on batch
                _, cost, summary, step = sess.run(
                    [train_op, cross_entropy, summary_op, global_step],
                    feed_dict={x: batch_x, y_: batch_y})
                writer.add_summary(summary, step)

                count += 1
                if count % frequency == 0 or i + 1 == batch_count:
                    elapsed_time = time.time() - start_time
                    start_time = time.time()
                    print("Step: %d," % (step + 1),
                          " Epoch: %2d," % (epoch + 1),
                          " Batch: %3d of %3d," % (i + 1, batch_count),
                          " Cost: %.4f," % cost,
                          " AvgTime: %3.2fms" % float(elapsed_time * 1000 / frequency))
                    count = 0

        print("Test-Accuracy: %2.2f" % sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
        print("Total Time: %3.2fs" % float(time.time() - begin_time))
        print("Final Cost: %.4f" % cost)

    sv.stop()
    print("done")
I run the above code on my three nodes with the following commands in the terminal:
pc-01$ python example.py --job-name="ps" --task_index=0
pc-02$ python example.py --job-name="worker" --task_index=0
pc-03$ python example.py --job-name="worker" --task_index=1
However, after the variables are initialized, the terminal of the worker keeps printing:
I tensorflow/core/distributed_runtime/master.cc:193] CreateSession still waiting for response from worker: /job:worker/replica:0/task:0
and the terminal of the ps does not proceed.
The IP of the ps is 192.168.1.102, and the IPs of the workers are 192.168.1.103 and 192.168.1.104, just like in the code above.
Can anyone help me?
I guess filtering devices should help here. Could you please try adding device_filters to your session config?
config = tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False,
    device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

with sv.prepare_or_wait_for_session(server.target, config=config) as sess:
This should fix the issue.
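For context, the device filter tells each worker's session to wait only for the ps tasks and its own worker task, instead of blocking until every worker in the cluster is reachable. If you later move off tf.train.Supervisor, the same idea carries over to tf.train.MonitoredTrainingSession; a rough sketch reusing the names from the code above:
config = tf.ConfigProto(
    allow_soft_placement=True,
    device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=(FLAGS.task_index == 0),
                                       config=config) as sess:
    while not sess.should_stop():
        sess.run(train_op)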

Tensorflow Model not Restoring and Running Properly

I have built and successfully trained a convolutional model; however, I cannot manage to restore the model and run my evaluation on it. The program throws a bunch of errors without giving an answer.
My code for the evaluation is here:
import tensorflow as tf
import main
import Process
import Input

eval_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt-98"
checkpoint_dir = "/Users/Zanhuang/Desktop/NNP/checkpoint"

def evaluate():
    with tf.Graph().as_default() as g:
        images, labels = Process.eval_inputs()
        forward_propgation_results = Process.forward_propagation(images)
        init_op = tf.initialize_all_variables()
        saver = tf.train.Saver()
        top_k_op = tf.nn.in_top_k(forward_propgation_results, labels, 1)

        with tf.Session(graph=g) as sess:
            sess.run(init_op)
            tf.train.start_queue_runners(sess=sess)
            saver.restore(sess, eval_dir)

            for i in range(100):
                print(sess.run(top_k_op))

def main(argv=None):
    evaluate()

if __name__ == '__main__':
    tf.app.run()
My output from the eval looks like this, but the program never actually runs; it just stays stuck there:
E tensorflow/core/client/tensor_c_api.cc:485] /Users/Zanhuang/Desktop/NNP/Prostate_Cancer_Data1.bin
     [[Node: ReaderRead = ReaderRead[_class=["loc:@FixedLengthRecordReader", "loc:@input_producer"], _device="/job:localhost/replica:0/task:0/cpu:0"](FixedLengthRecordReader, input_producer)]]
ERROR:tensorflow:Exception in QueueRunner: /Users/Zanhuang/Desktop/NNP/Prostate_Cancer_Data1.bin
     [[Node: ReaderRead = ReaderRead[_class=["loc:@FixedLengthRecordReader", "loc:@input_producer"], _device="/job:localhost/replica:0/task:0/cpu:0"](FixedLengthRecordReader, input_producer)]]
The following is the main part of my program where I save the model and the checkpoint files.
import Input
import Process
import time
import numpy as np
import tensorflow as tf
from datetime import datetime

FLAGS = tf.app.flags.FLAGS

def train():
    with tf.Session() as sess:
        images, labels = Process.inputs()

        forward_propgation_results = Process.forward_propagation(images)

        train_loss, cost = Process.error(forward_propgation_results, labels)

        image_summary_t = tf.image_summary(images.name, images, max_images=2)

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        saver = tf.train.Saver()

        sess.run(init)

        saver = tf.train.Saver(tf.all_variables())

        tf.train.start_queue_runners(sess=sess)

        train_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt"

        summary_writer = tf.train.SummaryWriter(train_dir, sess.graph)

        for step in range(100):
            start_time = time.time()
            print(sess.run([train_loss, cost]))
            duration = time.time() - start_time

            if step % 1 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, examples_per_sec, sec_per_batch))

                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            if step % 2 == 0:
                checkpoint_path = train_dir
                saver.save(sess, checkpoint_path, global_step=step)

def main(argv=None):
    train()

if __name__ == '__main__':
    tf.app.run()