I have built and trained a TensorFlow model, but unfortunately the checkpoint file cannot be opened, as shown by the error below.
Update: there is no longer an error, just a bunch of warnings that don't really tell you anything.
This happens when I run the evaluation code:
import tensorflow as tf
import main
import Process
import Input
# Path later passed to saver.restore() inside evaluate().
eval_dir = "/Users/Zanhuang/Desktop/NNP"
# Directory searched for the checkpoint state.
checkpoint_dir = "/Users/Zanhuang/Desktop/NNP"
# Checkpoint state found in checkpoint_dir; evaluate() tests it for truthiness
# before using its model_checkpoint_path.
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
# Build the evaluation graph, restore the saved variables and print the
# in_top_k result for 100 batches.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below has to be inferred.
def evaluate():
with tf.Graph().as_default() as g:
images, labels = Process.eval_inputs()
forward_propgation_results = Process.forward_propagation(images)
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
# k = 1: checks whether the label is the top prediction for each example.
top_k_op = tf.nn.in_top_k(forward_propgation_results, labels, 1)
with tf.Session(graph = g) as sess:
tf.train.start_queue_runners(sess = sess)
sess.run(init_op)
# NOTE(review): eval_dir is a directory here, whereas restore() is normally
# given a checkpoint prefix such as ckpt.model_checkpoint_path — confirm.
saver.restore(sess, eval_dir)
if ckpt and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
for i in range(100):
print(sess.run(top_k_op))
def main(argv=None):
    """Script entry point: run the evaluation once."""
    evaluate()


if __name__ == '__main__':
    # tf.app.run() parses the flags and then calls main().
    tf.app.run()
Next is how I generated the checkpoint file:
# On every even step, save a checkpoint under FLAGS.data_dir; passing
# global_step makes the saver append "-<step>" to the "model.ckpt" prefix.
if step % 2 == 0:
checkpoint_path = os.path.join(FLAGS.data_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step = step)
Related
I'm trying to save and restore the parameters of a TensorFlow model. The code does save the parameters to the given path, but when I try to restore the last checkpoint, perform an operation, and then save again, it doesn't return the last checkpoint. What should I do?
import tensorflow as tf
import os
# Two scalar variables that get checkpointed.
v1 = tf.Variable(1.32, name="v1")
v2 = tf.Variable(1.33, name="v2")
saver = tf.train.Saver()
init = tf.initialize_all_variables()
# First session: initialise the variables and write the initial checkpoint.
# NOTE(review): indentation was lost; which statements belong to the `with`
# body has to be inferred.
with tf.Session() as sess:
sess.run(init)
print (v2.eval(sess))
saver.save(sess, "/tmp/model")
print("Model restored.")
# Ten rounds of: open a session, restore the latest checkpoint, read v2,
# save again with the loop index as global_step.
for i in range(10):
ckpt = tf.train.get_checkpoint_state(os.path.dirname('/tmp/'))
print(ckpt)
sess = tf.Session()
if(ckpt and ckpt.model_checkpoint_path):
saver.restore(sess, ckpt.model_checkpoint_path)
x = sess.run("v2:0")
# NOTE(review): this increments only the local Python value x; nothing in this
# listing assigns it back to the variable v2 before the next save.
x = x + 1
print(x)
saver.save(sess, "/tmp/model", global_step = i)
sess.close()
I attempt to read an RNN network through import_graph_def() and do inference.
But I cannot use tf.trainable_variables() to get any variables.
In the following code, tf.trainable_variables() returns [] (an empty list).
Also, when I use saver = tf.train.Saver(), TensorFlow reports "no variables to save".
# Load a serialized GraphDef from graph_path, run its 'init' op and print the
# loss for each batch of the training range.
# NOTE(review): indentation was lost in this listing; nesting must be inferred.
def eval_on_test(graph_path):
batch_size = 80
train_begin = 0
train_end = 3000
with tf.Graph().as_default() as graph:
with open(graph_path, 'rb') as f:
tf_graph = tf.GraphDef()
print("Loading graph_def from {}".format(graph_path))
tf_graph.ParseFromString(f.read())
# Import the graph with no name prefix and fetch four tensors by name.
return_elements = tf.import_graph_def(tf_graph, name="", return_elements=['input_x:0', 'output_y:0', 'pred:0', 'loss:0'])
X = return_elements[0]
Y = return_elements[1]
# pred is fetched but not used below.
pred = return_elements[2]
loss = return_elements[3]
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
print("graph loaded, start testing")
with tf.Session(config=tf_config) as sess:
# The initializer is looked up by operation name in the imported graph.
init_op = sess.graph.get_operation_by_name('init')
sess.run(init_op)
# Reported to print [] for this imported graph.
print(tf.trainable_variables())
# NOTE(review): time_step and get_train_data are not defined in this function —
# presumably module-level names; confirm.
batch_index,train_x,train_y=get_train_data(batch_size,time_step,train_begin,train_end)
# Consecutive entries of batch_index delimit one batch slice each.
for batch in range(len(batch_index)-1):
loss_ = sess.run(loss, feed_dict={X:train_x[batch_index[batch]:batch_index[batch+1]],Y:train_y[batch_index[batch]:batch_index[batch+1]]})
print(batch, loss_)
Any help would be appreciated.
import_graph_def only restores the graph; it does not restore collections such as GLOBAL_VARIABLES, which is why Saver can't find any variables in the graph. To solve this, try tf.train.import_meta_graph, which also restores all collections.
I have read many similar questions and just cannot get this to work properly.
My model trains well and checkpoint files are written every epoch. I want the program to continue from epoch x once reloaded, and to print which epoch it is on with every iteration. I could simply save this data outside of the checkpoint file, but I also want to do it this way to give me confidence that everything else is being stored properly.
Unfortunately the value in the epoch/global_step variable is always still 0 when I restart.
import tensorflow as tf
import numpy as np
import tensorflow as tf
import numpy as np
# more imports
def extract_number(f):  # used to get the latest checkpoint file
    """Return the sort key ``(epoch, f)`` for the checkpoint file name ``f``.

    The epoch is the integer captured from ``epoch<digits>.ckpt`` in ``f``.
    Names that do not match get ``-1`` so they sort below real checkpoints.
    """
    # Raw string avoids the invalid "\d" escape; the dot is escaped so that it
    # only matches a literal "." in front of "ckpt".
    s = re.findall(r"epoch(\d+)\.ckpt", f)
    return (int(s[0]) if s else -1, f)
# Restore the newest checkpoint found under ./params (if any) and run init_op.
# NOTE(review): indentation was lost; as listed, sess.run(init_op) appears to
# run whether or not a checkpoint was restored — confirm.
# NOTE(review): the locals `list` and `file` shadow builtin names.
def restore(init_op, sess, saver): # called to restore or just initialise model
list = glob(os.path.join("./params/e*"))
if list:
# Pick the entry with the highest epoch number.
file = max(list,key=extract_number)
# The restore path is the file name minus its last five characters
# (presumably a ".meta"-style suffix — confirm).
saver.restore(sess, file[:-5])
# NOTE(review): running the initializer after saver.restore() resets the
# variables, including the restored global step, to their initial values.
sess.run(init_op)
return
# Training driver: build the graph, restore or initialise the variables, then
# train epoch by epoch, saving a checkpoint and a sample after each epoch.
# NOTE(review): indentation was lost in this listing; nesting must be inferred.
with tf.Graph().as_default() as g:
# build models
total_batch = data.train.num_examples / batch_size
epochLimit = 51
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
# The Saver and initializer are created a second time here, replacing the
# two objects built just above.
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
restore(init_op, sess, saver)
# Resume from the epoch counter held in global_step.
epoch = global_step.eval()
while epoch < epochLimit:
total_batch = data.train.num_examples / batch_size
for i in range(int(total_batch)):
sys.stdout.flush()
voxels = newData.eval()
# Uniform random batch in [-1, 1) fed as z.
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
sess.run(opt_G, feed_dict={z:batch_z, train:True})
sess.run(opt_D, feed_dict={input:voxels, z:batch_z, train:True})
# NOTE(review): loss.csv is opened for append, but no write to f is visible in
# this listing — confirm.
with open("out/loss.csv", 'a') as f:
batch_loss_G = sess.run(loss_G, feed_dict={z:batch_z, train:False})
batch_loss_D = sess.run(loss_D, feed_dict={input:voxels, z:batch_z, train:False})
msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
print(msgOut)
# Advance the epoch, store it in global_step and write the checkpoint.
epoch=epoch+1
sess.run(global_step.assign(epoch))
saver.save(sess, "params/epoch{0}.ckpt".format(epoch))
# Run x_ on a fresh random batch and save the first result, reshaped to
# 32x32x32 and thresholded at 0.
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
voxels = sess.run(x_, feed_dict={z:batch_z})
v = voxels[0].reshape([32, 32, 32]) > 0
util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
I also update the global step variable using assign at the bottom. Any ideas? Any help would be greatly appreciated.
When you call sess.run(init_op) after restoring, it resets all variables to their initial values. Comment that line out and things should work.
My original code was wrong for several reasons because I was trying so many things. The first responder Alexandre Passos gives a valid point, but I believe what changed the game was also the use of scopes (maybe?).
Below is the working updated code if it helps anyone:
import tensorflow as tf
import numpy as np
# more imports
def extract_number(f):  # used to get the latest checkpoint file
    """Return the sort key ``(epoch, f)`` for the checkpoint file name ``f``.

    The epoch is the integer captured from ``epoch<digits>.ckpt`` in ``f``.
    Names that do not match get ``-1`` so they sort below real checkpoints.
    """
    # Raw string avoids the invalid "\d" escape; the dot is escaped so that it
    # only matches a literal "." in front of "ckpt".
    s = re.findall(r"epoch(\d+)\.ckpt", f)
    return (int(s[0]) if s else -1, f)
def restore(sess, saver):  # called to restore or just initialise model
    """Restore the newest checkpoint under ./params, or initialise from scratch.

    Returns ``(saver, restored, sess)``. ``restored`` is True when a checkpoint
    was loaded into ``sess``; it is False when no file matched, in which case a
    newly created Saver is returned and all global variables are initialised.
    """
    candidates = glob(os.path.join("./params/e*"))
    if not candidates:
        # Nothing to resume from: new Saver, then run the global initializer.
        saver = tf.train.Saver()
        fresh_init = tf.global_variables_initializer()
        sess.run(fresh_init)
        return saver, False, sess
    newest = max(candidates, key=extract_number)
    # The restore path is the chosen file name minus its last five characters.
    saver.restore(sess, newest[:-5])
    return saver, True, sess
# Settings and data set for the training run.
batch_size = 100
learning_rate = 0.0001
beta1 = 0.5
# Second dimension of the random batch_z array that is fed as z.
z_size = 100
save_interval = 1
data = dataset.read()
# Number of batches per epoch; may be fractional, callers wrap it in int().
total_batch = data.train.num_examples / batch_size
def fill_queue():
    """Feed training batches to the queue through enqueue_op.

    Performs ``int(total_batch * epochLimit)`` enqueue runs. Meant to run in a
    separate thread that feeds a FIFO queue.
    """
    enqueue_count = int(total_batch * epochLimit)
    for _ in range(enqueue_count):
        next_batch = data.train.next_batch(batch_size)
        sess.run(enqueue_op, feed_dict={batch: next_batch})
# Training driver (updated version): the epoch counter lives in the variable
# glob/global_step, is read back after restore(), and is re-assigned before
# each checkpoint is written.
# NOTE(review): indentation was lost in this listing; nesting must be inferred.
with tf.variable_scope("glob"):
global_step = tf.get_variable(name='global_step', initializer=0,trainable=False)
# build models
epochLimit = 51
saver = tf.train.Saver()
with tf.Session() as sess:
# restore() returns (saver, restored_flag, sess); rstr is not used below.
saver,rstr,sess = restore(sess, saver)
# Re-enter the scope with reuse=True to fetch the existing global_step variable.
with tf.variable_scope("glob", reuse=True):
epocht = tf.get_variable(name='global_step', trainable=False, dtype=tf.int32)
epoch = epocht.eval()
while epoch < epochLimit:
total_batch = data.train.num_examples / batch_size
for i in range(int(total_batch)):
sys.stdout.flush()
voxels = newData.eval()
# Uniform random batch in [-1, 1) fed as z.
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
sess.run(opt_G, feed_dict={z:batch_z, train:True})
sess.run(opt_D, feed_dict={input:voxels, z:batch_z, train:True})
# NOTE(review): loss.csv is opened for append, but no write to f is visible in
# this listing — confirm.
with open("out/loss.csv", 'a') as f:
batch_loss_G = sess.run(loss_G, feed_dict={z:batch_z, train:False})
batch_loss_D = sess.run(loss_D, feed_dict={input:voxels, z:batch_z, train:False})
msgOut = "Epoch: [{0}], i: [{1}], G_Loss[{2:.8f}], D_Loss[{3:.8f}]".format(epoch, i, batch_loss_G, batch_loss_D)
print(msgOut)
# Advance the epoch, store it in global_step and write the checkpoint.
epoch=epoch+1
sess.run(global_step.assign(epoch))
saver.save(sess, "params/epoch{0}.ckpt".format(epoch))
# Run x_ on a fresh random batch and save the first result, reshaped to
# 32x32x32 and thresholded at 0.
batch_z = np.random.uniform(-1, 1, [batch_size, z_size]).astype(np.float32)
voxels = sess.run(x_, feed_dict={z:batch_z})
v = voxels[0].reshape([32, 32, 32]) > 0
util.save_binvox(v, "out/epoch{0}.vox".format(epoch), 32)
I am running TensorFlow 0.12.1 on a GPU. I have a trained deep CNN model whose weights I've saved using a checkpoint file. During inference, I reload the saved checkpoint using restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir)). The code seems to run without issues, but every time I re-run the script, I get screwed-up outputs. AFAIK, I do not shuffle my test set inputs. The inputs are being loaded and fed to the network properly. It is just that different runs of the CNN on the same test set, in the same order, produce very different outputs. I'm perplexed! Also, how do I execute a graph loaded from a saved checkpoint without running an init_op during inference? It seems my code requires all global and local variables to be initialized before execution. (I initialize first, and only then restore the checkpoint!) Here's a snippet of my code:
import tensorflow as tf
import numpy as np
import os
import os.path
from datetime import datetime
import time
import random
import json
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
from modelFCNN3 import model
def read_input(inp_queue, height=224, width=224, channels=3, mask=False):
    """Read and decode the PNG named by ``inp_queue`` into a float32 tensor.

    The decoded image is resized to ``height`` x ``width`` (resize method 2),
    cast to uint8 and given the shape ``[height, width, channels]``. With
    ``mask=True`` the result is 1.0 where the pixel value is >= 128 and 0.0
    elsewhere; otherwise the image is passed through
    ``tf.image.per_image_standardization``.
    """
    target_shape = [height, width, channels]
    raw_bytes = tf.read_file(inp_queue)
    decoded = tf.image.decode_png(raw_bytes)
    resized = tf.image.resize_images(decoded, [height, width], method=2)
    pixels = tf.cast(resized, tf.uint8)
    pixels.set_shape(target_shape)
    pixels = tf.reshape(pixels, target_shape)
    if mask:
        processed = tf.to_float(tf.greater_equal(pixels, 128))
    else:
        processed = tf.image.per_image_standardization(pixels)
    # Both branches end with the same float32 cast.
    return tf.cast(processed, tf.float32)
# Inference driver: build an input queue over the test file lists, build the
# model, restore the latest checkpoint and accumulate streaming mean IOU until
# the single-epoch queue is exhausted.
# NOTE(review): indentation was lost in this listing; nesting must be inferred.
if __name__ == '__main__':
tf.reset_default_graph()
with open('X_test.json', 'r') as infile:
X_test = json.load(infile)
with open('y_test.json', 'r') as infile:
y_test = json.load(infile)
imagelist = ops.convert_to_tensor(X_test, dtype=dtypes.string)
labellist = ops.convert_to_tensor(y_test, dtype=dtypes.string)
# One unshuffled pass over the (image, label) pairs.
input_queue = tf.train.slice_input_producer([imagelist, labellist],
num_epochs=1,
shuffle=False)
image = read_input(input_queue[0],height=224,width=224,channels=3, mask=False)
label = read_input(input_queue[1],height=224,width=224,channels=1, mask=True)
images_batch, labels_batch = tf.train.batch([image, label], batch_size=FLAGS.batch_size,
enqueue_many=False,shapes=None, allow_smaller_final_batch=True)
global_step = tf.Variable(0, trainable=False)
# Placeholders default to the queue batches when nothing is fed.
images = tf.placeholder_with_default(images_batch, shape=[None, 224,224,3])
labels = tf.placeholder_with_default(labels_batch, shape=[None, 224,224,1])
# NOTE(review): this Saver is constructed before model(images) on the next
# line builds its variables. A Saver created without var_list presumably only
# covers variables that already exist, so the model weights may keep their
# freshly initialised values after restore() — confirm.
restorer = tf.train.Saver()
logits = model(images).logits
labels = tf.cast(labels,tf.int32)
labels.set_shape([FLAGS.batch_size,224,224,1])
# Class index with the highest softmax score along axis 3.
valid_prediction = tf.argmax(tf.nn.softmax(logits), dimension=3)
valid_prediction.set_shape([FLAGS.batch_size,224,224])
meanIOU,update_op_mIOU= tf.contrib.metrics.streaming_mean_iou(tf.cast(valid_prediction,tf.int32), tf.squeeze(labels),FLAGS.num_classes)
init = tf.global_variables_initializer()
init_locals = tf.local_variables_initializer()
with tf.Session() as sess:
# Initialise first, then restore from the latest checkpoint.
sess.run([init, init_locals])
restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
print("Model restored.")
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord,sess=sess)
summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
try:
step = 0
avg = []
while not coord.should_stop():
# NOTE(review): two separate run() calls per step; each presumably dequeues
# its own batch from the input queue — confirm.
myimg, predimg, mylbl= sess.run([images,valid_prediction,labels])
mIOU,_ = sess.run([meanIOU,update_op_mIOU])
avg.append(mIOU)
step += 1
# Raised once the single-epoch input queue runs out of data.
except tf.errors.OutOfRangeError:
print('Done training -- epoch limit reached')
finally:
coord.request_stop()
coord.join(threads)
sess.close()
Are you running on the same machine or a different machine?
#saver = tf.train.Saver()
The following comment is in tensorflow docs
#NOTE: Restarting training from saved meta_graph only works if the device assignments have not changed.
#saver = tf.train.import_meta_graph(metafile)
I have built and successfully trained a convolutional model, however I cannot manage to restore the model and run my evaluation on it. The program throws a bunch of errors without giving an answer.
My code for the evaluation is here:
import tensorflow as tf
import main
import Process
import Input
# Checkpoint path passed to saver.restore() inside evaluate().
eval_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt-98"
# Not referenced by the rest of this listing.
checkpoint_dir = "/Users/Zanhuang/Desktop/NNP/checkpoint"
# Build the evaluation graph, restore the variables from eval_dir and print
# the in_top_k result for 100 batches.
# NOTE(review): indentation was lost in this listing; nesting must be inferred.
def evaluate():
with tf.Graph().as_default() as g:
images, labels = Process.eval_inputs()
forward_propgation_results = Process.forward_propagation(images)
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
# k = 1: checks whether the label is the top prediction for each example.
top_k_op = tf.nn.in_top_k(forward_propgation_results, labels, 1)
with tf.Session(graph = g) as sess:
sess.run(init_op)
# Start the input queue threads; no Coordinator is used here.
tf.train.start_queue_runners(sess=sess)
# Restoring after init_op overwrites the initial values with the saved ones.
saver.restore(sess, eval_dir)
for i in range(100):
print(sess.run(top_k_op))
def main(argv=None):
    """Script entry point: run the evaluation once."""
    evaluate()


if __name__ == '__main__':
    # tf.app.run() parses the flags and then calls main().
    tf.app.run()
The output of my evaluation looks like this, but the program does not actually run. It just stays stuck there.
E tensorflow/core/client/tensor_c_api.cc:485] /Users/Zanhuang/Desktop/NNP/Prostate_Cancer_Data1.bin
[[Node: ReaderRead = ReaderRead[_class=["loc:#FixedLengthRecordReader", "loc:#input_producer"], _device="/job:localhost/replica:0/task:0/cpu:0"](FixedLengthRecordReader, input_producer)]]
ERROR:tensorflow:Exception in QueueRunner: /Users/Zanhuang/Desktop/NNP/Prostate_Cancer_Data1.bin
[[Node: ReaderRead = ReaderRead[_class=["loc:#FixedLengthRecordReader", "loc:#input_producer"], _device="/job:localhost/replica:0/task:0/cpu:0"](FixedLengthRecordReader, input_producer)]]
The following is the main part of my program where I save the model and the checkpoint files.
import Input
import Process
import time
import numpy as np
import tensorflow as tf
from datetime import datetime
# Shared flags object; train() reads FLAGS.batch_size from it.
FLAGS = tf.app.flags.FLAGS
# Build the training graph inside a session, run 100 steps, log throughput and
# summaries, and save a checkpoint on every even step.
# NOTE(review): indentation was lost in this listing; nesting must be inferred.
def train():
with tf.Session() as sess:
images, labels = Process.inputs()
forward_propgation_results = Process.forward_propagation(images)
train_loss, cost = Process.error(forward_propgation_results, labels)
image_summary_t = tf.image_summary(images.name, images, max_images = 2)
summary_op = tf.merge_all_summaries()
init = tf.initialize_all_variables()
saver = tf.train.Saver()
sess.run(init)
# This second Saver replaces the one created a few lines above.
saver = tf.train.Saver(tf.all_variables())
tf.train.start_queue_runners(sess = sess)
# NOTE(review): the same path is used as the SummaryWriter log directory and
# as the checkpoint prefix passed to saver.save() — confirm this is intended.
train_dir = "/Users/Zanhuang/Desktop/NNP/model.ckpt"
summary_writer = tf.train.SummaryWriter(train_dir, sess.graph)
for step in range(100):
start_time = time.time()
print(sess.run([train_loss, cost]))
duration = time.time() - start_time
# step % 1 is always 0, so this branch runs on every step.
if step % 1 == 0:
num_examples_per_step = FLAGS.batch_size
examples_per_sec = num_examples_per_step / duration
sec_per_batch = float(duration)
format_str = ('%s: step %d, (%.1f examples/sec; %.3f ''sec/batch)')
print (format_str % (datetime.now(), step, examples_per_sec, sec_per_batch))
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
# Save on even steps; global_step appends "-<step>" to the checkpoint name.
if step % 2 == 0:
checkpoint_path = train_dir
saver.save(sess, checkpoint_path, global_step = step)
def main(argv=None):
    """Script entry point: run the training loop once."""
    train()


if __name__ == '__main__':
    # tf.app.run() parses the flags and then calls main().
    tf.app.run()