TensorFlow results inconsistent between each graph freeze - tensorflow

When I freeze a graph of my semantic segmentation model and then run it elsewhere (on a mobile device), the output is of low quality compared to inference on the server: it is basically a messy version of what the server produces. The graph executes successfully, but it appears as though something was not initialized prior to freezing, even though the model-loading code is nearly identical between the export script and the inference script.
The exported model can be run on the same images over and over and produces the same results for a given set of images, as expected.
However, each time the model is frozen, using the exact same script and checkpoint, it produces a different output for a given set of images.
def main():
    args = get_arguments()
    if args.dataset == 'cityscapes':
        num_classes = cityscapes_class
    else:
        num_classes = ADE20k_class
    shape = [320, 320]
    x = tf.placeholder(dtype=tf.float32, shape=(shape[0], shape[1], 3), name="input")
    img_tf = preprocess(x)
    model = model_config[args.model]
    net = model({'data': img_tf}, num_classes=num_classes, filter_scale=args.filter_scale)
    raw_output = net.layers['conv6_cls']
    raw_output_up = tf.image.resize_bilinear(raw_output, size=shape, align_corners=True)
    raw_output_maxed = tf.argmax(raw_output_up, axis=3, name="output")
    # Init tf Session
    config = tf.ConfigProto()
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    model_path = model_paths[args.model]
    ckpt = tf.train.get_checkpoint_state(model_path)
    if ckpt and ckpt.model_checkpoint_path:
        input_checkpoint = ckpt.model_checkpoint_path
        loader = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint file found at %s.' % model_path)
        exit()
    print("Loaded Model")
    # We retrieve the protobuf graph definition
    graph = tf.get_default_graph()
    input_graph_def = graph.as_graph_def()
    # We use a built-in TF helper to export variables to constants
    output_graph_def = graph_util.convert_variables_to_constants(
        sess,                         # The session is used to retrieve the weights
        input_graph_def,              # The graph_def is used to retrieve the nodes
        output_node_names.split(",")  # The output node names are used to select the useful nodes
    )
    # Finally we serialize and dump the output graph to the filesystem
    with tf.gfile.GFile("model/output_graph.pb", "wb") as f:
        f.write(output_graph_def.SerializeToString())
    print("%d ops in the final graph." % len(output_graph_def.node))

Related

how to restore variables in fully_connected function

In my training file (train.py), I write:
def deep_part(self):
    with tf.variable_scope("deep-part"):
        y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.factor_size])  # None * (F*K)
        # self.deep_layers = 2
        for i in range(0, len(self.deep_layers)):
            y_deep = tf.contrib.layers.fully_connected(y_deep, self.deep_layers[i],
                                                       activation_fn=self.deep_layers_activation, scope='fc%d' % i)
        return y_deep
Now, in my predict file (predict.py), I restore the checkpoint, but I don't know how to reload the "deep-part" network's weights and biases, because I think the fully_connected function might hide them.
I wrote a lengthy explanation here. A short summary:
Calling saver.save(sess, '/tmp/my_model') makes TensorFlow produce multiple files:
checkpoint
my_model.data-00000-of-00001
my_model.index
my_model.meta
The checkpoint file checkpoint is just a pointer to the latest version of the model weights; it is simply a plain-text file containing
$ cat /tmp/checkpoint
model_checkpoint_path: "/tmp/my_model"
all_model_checkpoint_paths: "/tmp/my_model"
The others are binary files containing the graph (.meta) and weights (.data*).
You can see what happens for yourself by running:
import tensorflow as tf
import numpy as np

data = np.arange(9 * 1).reshape(1, 9).astype(np.float32)
plhdr = tf.placeholder(tf.float32, shape=[1, 9], name='input')
print(plhdr.name)
activation = tf.layers.dense(plhdr, 10, name='fc')
print(activation.name)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    expected = sess.run(activation, {plhdr: data})
    print(expected)
    saver = tf.train.Saver(tf.global_variables())
    saver.save(sess, '/tmp/my_model')

tf.reset_default_graph()
with tf.Session() as sess:
    # load the computation graph (the fully connected layer + placeholder)
    loader = tf.train.import_meta_graph('/tmp/my_model.meta')
    sess.run(tf.global_variables_initializer())
    plhdr = tf.get_default_graph().get_tensor_by_name('input:0')
    activation = tf.get_default_graph().get_tensor_by_name('fc/BiasAdd:0')
    actual = sess.run(activation, {plhdr: data})
    assert not np.allclose(actual, expected)  # random init, weights not loaded yet
    # now load the weights
    loader.restore(sess, '/tmp/my_model')
    actual = sess.run(activation, {plhdr: data})
    assert np.allclose(actual, expected)  # matches the original outputs
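Back to the original question: fully_connected does not hide its parameters. tf.contrib.layers.fully_connected registers them as ordinary variables, typically named <scope>/weights and <scope>/biases, so once the meta-graph and weights are restored you can look them up by name. A minimal sketch; the checkpoint path and the exact tensor names below are assumptions based on the scopes in train.py, so print the variable list first to confirm:
tf.reset_default_graph()
with tf.Session() as sess:
    loader = tf.train.import_meta_graph('/tmp/my_model.meta')  # your checkpoint prefix (assumption)
    loader.restore(sess, '/tmp/my_model')
    # list every restored variable to find the exact names
    for v in tf.global_variables():
        print(v.name, v.get_shape())
    # assumed names for the first layer under the "deep-part" scope:
    w = tf.get_default_graph().get_tensor_by_name('deep-part/fc0/weights:0')
    b = tf.get_default_graph().get_tensor_by_name('deep-part/fc0/biases:0')
    print(sess.run(w).shape, sess.run(b).shape)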

Tensorflow, use import_graph_def() to load model error

I am attempting to read an RNN network through import_graph_def() and run inference,
but I cannot use tf.trainable_variables() to get any variables.
In the following code, tf.trainable_variables() returns [] (an empty list).
Also, when I use saver = tf.train.Saver(), TensorFlow reports "no variables to save".
def eval_on_test(graph_path):
    batch_size = 80
    train_begin = 0
    train_end = 3000
    with tf.Graph().as_default() as graph:
        with open(graph_path, 'rb') as f:
            tf_graph = tf.GraphDef()
            print("Loading graph_def from {}".format(graph_path))
            tf_graph.ParseFromString(f.read())
        return_elements = tf.import_graph_def(tf_graph, name="", return_elements=['input_x:0', 'output_y:0', 'pred:0', 'loss:0'])
        X = return_elements[0]
        Y = return_elements[1]
        pred = return_elements[2]
        loss = return_elements[3]
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        print("graph loaded, start testing")
        with tf.Session(config=tf_config) as sess:
            init_op = sess.graph.get_operation_by_name('init')
            sess.run(init_op)
            print(tf.trainable_variables())
            batch_index, train_x, train_y = get_train_data(batch_size, time_step, train_begin, train_end)
            for batch in range(len(batch_index) - 1):
                loss_ = sess.run(loss, feed_dict={X: train_x[batch_index[batch]:batch_index[batch + 1]],
                                                  Y: train_y[batch_index[batch]:batch_index[batch + 1]]})
                print(batch, loss_)
Any help would be appreciated.
import_graph_def will only restore the graph; it will not restore collections such as GLOBAL_VARIABLES. That is why the Saver can't find any variables in the graph. To solve this, try tf.train.import_meta_graph, which also restores all collections.
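A minimal sketch of that fix, assuming the checkpoint prefix is my_model:
import tensorflow as tf

with tf.Session() as sess:
    # import_meta_graph rebuilds the graph *and* the collections
    # (GLOBAL_VARIABLES, TRAINABLE_VARIABLES, ...)
    saver = tf.train.import_meta_graph('my_model.meta')  # path is an assumption
    saver.restore(sess, 'my_model')
    print(tf.trainable_variables())  # no longer an empty list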

In TensorFlow, how to use a restored meta-graph if the meta-graph was fed with TFRecord input (without placeholders)

I trained a network with a TFRecord input pipeline; in other words, there were no placeholders. A simple example would be:
input, truth = _get_next_batch() # TFRecord. `input` is not a tf.placeholder
net = Model(input)
net.set_loss(truth)
optimizer = tf...(net.loss)
Let's say I acquired three files: ckpt-20000.meta, ckpt-20000.data-0000-of-0001, and ckpt-20000.index. I understand that one can later import the meta-graph using the .meta file and access tensors such as:
new_saver = tf.train.import_meta_graph('ckpt-20000.meta')
new_saver.restore(sess, 'ckpt-20000')
logits = tf.get_collection("logits")[0]
However, the meta-graph does not have a placeholder anywhere in the pipeline. Is there a way I can use the meta-graph and run inference on an input?
For information, in a query application (or script), I used to define a model with a placeholder and restore the model weights (see below). I am wondering if I can just use the meta-graph without re-defining the model, since that would be much simpler.
input = tf.placeholder(...)
net = Model(input)
saver = tf.train.Saver()
saver.restore(sess, 'ckpt-20000')
lgt = sess.run(net.logits, feed_dict={input: img})
You can build a graph that uses placeholder_with_default() for the inputs, so it can consume both the TFRecord input pipeline and a feed_dict{}.
An example:
input, truth = _get_next_batch()
_x = tf.placeholder_with_default(input, shape=[...], name='input')
_y = tf.placeholder_with_default(truth, shape=[...], name='label')
net = Model(_x)
net.set_loss(_y)
optimizer = tf...(net.loss)
Then during inference,
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    new_saver = tf.train.import_meta_graph('ckpt-20000.meta')
    new_saver.restore(sess, 'ckpt-20000')
    # Get the tensors by their variable name
    input = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name(...)
    # Now you can feed the inputs to your tensors
    lgt = sess.run(logits, feed_dict={input: img})
In the above example, if you don't feed input, then the input will be read from the TFRecord input pipeline.
Is there a way to do it without placeholders at test time, though? It should be possible to re-use the graph with a new input pipeline without resorting to slow placeholders (e.g. the test dataset may be very large). placeholder_with_default is a suboptimal solution in that case.
The recommended way is to save two meta-graphs: one for training/validation/testing, and the other for inference.
see Building a SavedModel
export_dir = ...
...
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
with tf.Session(graph=tf.Graph()) as sess:
    ...
    builder.add_meta_graph_and_variables(sess,
                                         [tag_constants.TRAINING],
                                         signature_def_map=foo_signatures,
                                         assets_collection=foo_assets)
...
# Add a second MetaGraphDef for inference.
with tf.Session(graph=tf.Graph()) as sess:
    ...
    builder.add_meta_graph([tag_constants.SERVING])
...
builder.save()
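On the consumer side, each meta-graph can then be loaded back by its tag; a minimal sketch with the TF1 loader, assuming the same export_dir:
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    # loads the graph tagged SERVING plus its variables into sess
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    # tensors can now be fetched by name from sess.graph and run as usual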
The NMT tutorial also provides a detailed example of creating multiple graphs with shared variables: Neural Machine Translation (seq2seq) Tutorial - Building Training, Eval, and Inference Graphs:
train_graph = tf.Graph()
eval_graph = tf.Graph()
infer_graph = tf.Graph()

with train_graph.as_default():
    train_iterator = ...
    train_model = BuildTrainModel(train_iterator)
    initializer = tf.global_variables_initializer()

with eval_graph.as_default():
    eval_iterator = ...
    eval_model = BuildEvalModel(eval_iterator)

with infer_graph.as_default():
    infer_iterator, infer_inputs = ...
    infer_model = BuildInferenceModel(infer_iterator)

checkpoints_path = "/tmp/model/checkpoints"

train_sess = tf.Session(graph=train_graph)
eval_sess = tf.Session(graph=eval_graph)
infer_sess = tf.Session(graph=infer_graph)

train_sess.run(initializer)
train_sess.run(train_iterator.initializer)

for i in itertools.count():
    train_model.train(train_sess)
    if i % EVAL_STEPS == 0:
        checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=i)
        eval_model.saver.restore(eval_sess, checkpoint_path)
        eval_sess.run(eval_iterator.initializer)
        while data_to_eval:
            eval_model.eval(eval_sess)
    if i % INFER_STEPS == 0:
        checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=i)
        infer_model.saver.restore(infer_sess, checkpoint_path)
        infer_sess.run(infer_iterator.initializer, feed_dict={infer_inputs: infer_input_data})
        while data_to_infer:
            infer_model.infer(infer_sess)

TensorFlow session and graph context

My question is about context and the TensorFlow default sessions and graph.
The problem:
TensorFlow is unable to feed a placeholder in the following scenario:
Function test defines a graph.
Function test_once defines a session.
When test calls test_once -> feeding fails.
When I change the code so that test declares both the graph and the session -> everything works.
Here is the code:
def test_once(g, saver, summary_writer, logits, images, summary_op):
    """Run a session once for a given test image.
    Args:
      g: Graph to run the session on.
      saver: Saver.
      summary_writer: Summary writer.
      logits: Logits tensor to evaluate.
      images: Test image batch.
      summary_op: Summary op.
    """
    with tf.Session(graph=g) as sess:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # extract global_step from it.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        else:
            print('No checkpoint file found')
            return
        images.astype(np.float32)
        predictions = sess.run(logits, feed_dict={'InputPlaceHolder/TestInput:0': images})
        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary_writer.add_summary(summary, global_step)
        return predictions
def test():
    """Test LCPR with a test image."""
    with tf.Graph().as_default() as g:
        # Get image for testing
        images, labels = lcpr.test_input()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        with tf.name_scope('InputPlaceHolder'):
            test_image_placeholder = tf.placeholder(tf.float32, (None, None, None, 3), 'TestInput')
        # Display the training images in the visualizer.
        # The 'max_outputs' default is 3. Not stated. (Max number of batch elements to generate images for.)
        #tf.summary.image('input_images', test_image_placeholder)
        with tf.name_scope('Inference'):
            logits = lcpr.inference(test_image_placeholder)
        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            lcpr.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        # Build the summary operation based on the TF collection of Summaries.
        writer = tf.summary.FileWriter("/tmp/lcpr/test")
        writer.add_graph(g)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(FLAGS.test_dir, g)
        # Sadly, this will not work:
        predictions = test_once(g, saver, summary_writer, logits, images, summary_op)
        '''Alternative working option:
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
            else:
                print('No checkpoint file found')
                return
            x = sess.run(logits, feed_dict={'InputPlaceHolder/TestInput:0': images})
            print(x)
        '''
The above code yields an error that the placeholder is not fed:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'InputPlaceHolder/TestInput' with dtype float
And it's not that TensorFlow does not recognize the placeholder: if I change the name from 'InputPlaceHolder/TestInput:0' to 'InputPlaceHolder/TestInput:1', I receive a message claiming that 'InputPlaceHolder/TestInput' exists but has only 1 output. This makes sense, so I guess the session does run on my graph.
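(For reference, a tensor name is <op_name>:<output_index>, so a single-output op only exposes index 0; a tiny illustration:)
p = tf.placeholder(tf.float32, name='TestInput')
print(p.name)  # 'TestInput:0' -> op 'TestInput', output index 0; ':1' would require a second output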
Things only work for me if I stay within the same def:
If I change the code by running the commented part (starting with tf.Session() as sess:) directly from within the first function, everything works.
I wonder what I am missing.
My guess is that it is context related, maybe the session is not being assigned to the graph?
Solved. Stupid mistake.
test_once calls sess.run twice; on the second call, no placeholder is fed: summary.ParseFromString(sess.run(summary_op)).
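The corresponding fix is to feed the placeholder on every run, e.g. by combining the two calls into one; a minimal sketch using the names from test_once above:
predictions, summary_str = sess.run(
    [logits, summary_op],
    feed_dict={'InputPlaceHolder/TestInput:0': images})  # placeholder fed for both fetches
summary = tf.Summary()
summary.ParseFromString(summary_str)
summary_writer.add_summary(summary, global_step)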

Multiple runs of an inference model using a saved checkpoint produce stochastic errors - Tensorflow

I am running TensorFlow 0.12.1 on a GPU. I have a trained deep CNN model whose weights I've saved using a checkpoint file. During inference, I reload the saved checkpoint using restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir)). The code seems to run without issues, but every time I re-run the script I get garbled outputs. AFAIK, I do not shuffle my test set inputs, and the inputs are being loaded and fed to the network properly. Yet different runs of the CNN on the same test set, in the same order, produce very different outputs. I'm perplexed! Also, how do I execute a graph loaded from a saved checkpoint without running an init_op during inference? It seems my code requires all global and local variables to be initialized before execution (I initialize first, and only then restore the checkpoint!). Here's a snippet of my code:
import tensorflow as tf
import numpy as np
import os
import os.path
from datetime import datetime
import time
import random
import json
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
from modelFCNN3 import model

def read_input(inp_queue, height=224, width=224, channels=3, mask=False):
    value = tf.read_file(inp_queue)
    image = tf.image.decode_png(value)
    image = tf.image.resize_images(image, [height, width], method=2)
    image = tf.cast(image, tf.uint8)
    image.set_shape([height, width, channels])
    image = tf.reshape(image, [height, width, channels])
    if mask:
        image = tf.to_float(tf.greater_equal(image, 128))
        image = tf.cast(image, tf.float32)
    else:
        image = tf.image.per_image_standardization(image)
        image = tf.cast(image, tf.float32)
    return image
if __name__ == '__main__':
    tf.reset_default_graph()
    with open('X_test.json', 'r') as infile:
        X_test = json.load(infile)
    with open('y_test.json', 'r') as infile:
        y_test = json.load(infile)
    imagelist = ops.convert_to_tensor(X_test, dtype=dtypes.string)
    labellist = ops.convert_to_tensor(y_test, dtype=dtypes.string)
    input_queue = tf.train.slice_input_producer([imagelist, labellist],
                                                num_epochs=1,
                                                shuffle=False)
    image = read_input(input_queue[0], height=224, width=224, channels=3, mask=False)
    label = read_input(input_queue[1], height=224, width=224, channels=1, mask=True)
    images_batch, labels_batch = tf.train.batch([image, label], batch_size=FLAGS.batch_size,
                                                enqueue_many=False, shapes=None,
                                                allow_smaller_final_batch=True)
    global_step = tf.Variable(0, trainable=False)
    images = tf.placeholder_with_default(images_batch, shape=[None, 224, 224, 3])
    labels = tf.placeholder_with_default(labels_batch, shape=[None, 224, 224, 1])
    restorer = tf.train.Saver()
    logits = model(images).logits
    labels = tf.cast(labels, tf.int32)
    labels.set_shape([FLAGS.batch_size, 224, 224, 1])
    valid_prediction = tf.argmax(tf.nn.softmax(logits), dimension=3)
    valid_prediction.set_shape([FLAGS.batch_size, 224, 224])
    meanIOU, update_op_mIOU = tf.contrib.metrics.streaming_mean_iou(
        tf.cast(valid_prediction, tf.int32), tf.squeeze(labels), FLAGS.num_classes)
    init = tf.global_variables_initializer()
    init_locals = tf.local_variables_initializer()
    with tf.Session() as sess:
        sess.run([init, init_locals])
        restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))
        print("Model restored.")
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        try:
            step = 0
            avg = []
            while not coord.should_stop():
                myimg, predimg, mylbl = sess.run([images, valid_prediction, labels])
                mIOU, _ = sess.run([meanIOU, update_op_mIOU])
                avg.append(mIOU)
                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
            coord.join(threads)
        sess.close()
Are you running on the same machine or a different machine?
The following note is in the TensorFlow docs:
# saver = tf.train.Saver()
# NOTE: Restarting training from a saved meta_graph only works if the
# device assignments have not changed.
# saver = tf.train.import_meta_graph(metafile)
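If changed device assignments are indeed the problem, import_meta_graph accepts a clear_devices flag that strips the saved placements; a minimal sketch, with metafile and FLAGS.train_dir as above:
# clear_devices=True drops the device strings baked into the meta-graph,
# so the checkpoint can be restored on a machine with a different GPU layout
saver = tf.train.import_meta_graph(metafile, clear_devices=True)
saver.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))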