cloud ml engine version create error 3 - tensorflow-serving

I copied and used the code from this post: http://fdahms.com/2017/03/05/tensorflow-serving-jvm-client/
However, an error occurs when deploying a version:
Model validation failed: Serving metagraph must contain exactly one SignatureDef with key: serving_default
I was trying to fix the code with reference to https://github.com/tensorflow/serving/blob/master/tensorflow_serving/example/mnist_saved_model.py
import os

import tensorflow as tf
from tensorflow.python.util import compat

# Graph under export: z = 3 * x + y, where x and y are fed at request time.
x = tf.placeholder(tf.float32, shape=(None))
y = tf.placeholder(tf.float32, shape=(None))
three = tf.Variable(3, dtype=tf.float32)
z = tf.scalar_mul(three, x) + y

# The model is written to three_x_plus_y/<model_version>.
model_version = 1
path = os.path.join("three_x_plus_y", str(model_version))
builder = tf.saved_model.builder.SavedModelBuilder(path)

legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')

# Describe the input/output tensors for the serving signature.
tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
tensor_info_y = tf.saved_model.utils.build_tensor_info(y)
tensor_info_z = tf.saved_model.utils.build_tensor_info(z)

prediction_signature = (
    tf.saved_model.signature_def_utils.build_signature_def(
        inputs={'egg': tensor_info_x, 'bacon': tensor_info_y},
        outputs={'spam': tensor_info_z},
        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            # NOTE: the deployment validator demands a SignatureDef stored
            # under the key "serving_default"; "magic_model" is what
            # triggers the "Model validation failed" error quoted above.
            "magic_model": prediction_signature},
        legacy_init_op=legacy_init_op
    )
    builder.save()
But I got the same error.
I'm using Google Cloud Machine Learning Engine.
I need help. Thank you for reading.

Change the key in the signature_def_map from magic_model to serving_default.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            # "serving_default" is the key the serving validator looks up; it is
            # also available as
            # tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.
            "serving_default": prediction_signature},
        legacy_init_op=legacy_init_op
    )

Related

Tensorflow freeze_graph unable to initialize local_variables

When freezing a graph with a local variable, freeze_graph has an error stating "Attempting to use uninitialized value...". The local variable in question was initialized via:
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    b_init = tf.constant(10.0, shape=[2, 1], dtype="float32", name='bi')
    # Passing collections=[LOCAL_VARIABLES] registers 'b' as a local variable
    # only, so it is not part of the global-variables collection.
    b = tf.get_variable('b', initializer=b_init, collections=[tf.GraphKeys.LOCAL_VARIABLES])
I'm able to create a saved model and run the saved model. However, I'm trying to freeze another graph for optimization. This error will go away if I remove the 'LOCAL_VARIABLES' flag. However, this variable then becomes global, which causes an issue with reloading my checkpoint (Tensorflow is unable to find the variable in the checkpoint).
Normally, I'd expect freeze_graph to initialize 'b' using 'b_init'.
Code to reproduce the issue:
import os, sys, json
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from tensorflow.core.framework import variable_pb2
from tensorflow.python.framework import ops
from tensorflow.python.ops import variables
from tensorflow.python.framework.ops import register_proto_function
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.tools import freeze_graph
from tensorflow.python import ops
from tensorflow.tools.graph_transforms import TransformGraph
#flags
tf.app.flags.DEFINE_integer('model_version',1,'Models version number.')
tf.app.flags.DEFINE_string('export_model_dir','../model_batch/versions', 'Directory where model will be exported to')
FLAGS = tf.app.flags.FLAGS
def main(_):
    """Export a tiny graph with a local variable as a SavedModel, then freeze it.

    The graph assigns placeholder ``a`` to the local variable ``b``, scales it
    by 1.1 five times, and exposes the stacked intermediate results as the
    tensor named ``c``. The SavedModel is written to
    ``<export_model_dir>/<model_version>`` and the frozen graph to
    ``frozen_graph.pb`` inside that directory.

    Args:
        _: unused positional argument supplied by ``tf.app.run``.
    """
    a = tf.placeholder(dtype=tf.float32, shape=[2, 1])
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        b_init = tf.constant(10.0, shape=[2, 1], dtype="float32", name='bi')
        # 'b' lives only in the LOCAL_VARIABLES collection.
        b = tf.get_variable('b', initializer=b_init, collections=[tf.GraphKeys.LOCAL_VARIABLES])
    b = tf.assign(b, a)

    # Collect five successively scaled copies of b and stack them into 'c'.
    c = []
    for _step in range(5):
        b = b * 1.1
        c.append(b)
    c = tf.identity(c, name='c')

    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())

    with tf.Session() as sess:
        # init
        sess.run(init)
        print(tf.get_default_graph().get_collection(tf.GraphKeys.LOCAL_VARIABLES))

        # create saved model builder class
        export_path_base = FLAGS.export_model_dir
        export_path = os.path.join(
            tf.compat.as_bytes(export_path_base),
            tf.compat.as_bytes(str(FLAGS.model_version)))
        if tf.gfile.Exists(export_path):
            # SavedModelBuilder refuses to write into an existing directory.
            print ('Removing previous artifacts')
            tf.gfile.DeleteRecursively(export_path)

        # inputs
        tensor_info_a = tf.saved_model.utils.build_tensor_info(a)
        # outputs
        tensor_info_c = tf.saved_model.utils.build_tensor_info(c)

        print('Exporting trained model to', export_path)
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        # define signatures
        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={'cameras': tensor_info_a},
                outputs={'depthmap': tensor_info_c},
                method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_batch': prediction_signature})

        # export model
        builder.save(as_text=True)
        writer = tf.summary.FileWriter("output_batch", sess.graph)
        writer.close()

    # load graph from saved model
    print ('Freezing graph')
    initializer_nodes = ''
    output_node_names = 'c'
    saved_model_dir = os.path.join(FLAGS.export_model_dir, str(FLAGS.model_version))
    output_graph_filename = os.path.join(saved_model_dir, 'frozen_graph.pb')
    # Only the SavedModel inputs are used; the checkpoint/graph-file arguments
    # are left empty on purpose.
    freeze_graph.freeze_graph(
        input_saved_model_dir=saved_model_dir,
        output_graph=output_graph_filename,
        saved_model_tags=tag_constants.SERVING,
        output_node_names=output_node_names,
        initializer_nodes=initializer_nodes,
        input_graph=None,
        input_saver=False,
        input_binary=False,
        input_checkpoint=None,
        restore_op_name=None,
        filename_tensor_name=None,
        clear_devices=False)
if __name__ == '__main__':
    # tf.app.run() parses the command-line flags and then invokes main().
    tf.app.run()
I wasn't able to include local_variables in my frozen graph, but I did come up with a work around.
The initial problem was that my checkpoint was created from a graph that contained local_variables. Unfortunately, freezing the graph produced the error:
Attempting to use uninitialized value...
What I did to work-around the issue was to change the local variables to untrainable global variables. I then filtered out the global variables not in my checkpoint using the following solution:
https://stackoverflow.com/a/39142780/6693924
I'm able to create a savedModel and freeze its graph.

When restoring a network, an operation cannot be found in the restored graph

Using TensorFlow 1.9, I want to train a neural network in one Python file, and then restore the network using a different Python file. I have tried to do this using a simple example, but when I try to load my "prediction" operation, I receive an error. Specifically, the error is: KeyError: "The name 'prediction' refers to an Operation not in the graph.".
Below is my Python file to train and save the network. It generates some example data and trains a simple neural network, then saves the network every epoch.
import numpy as np
import tensorflow as tf
# Synthetic data: 100 samples with 10 features each, plus one label per sample.
input_data = np.zeros([100, 10])
label_data = np.zeros([100, 1])
for i in range(100):
    for j in range(10):
        input_data[i, j] = i * j / 1000
    label_data[i] = 2 * input_data[i, 0] + np.random.uniform(0.01)

# Named placeholders so they can be looked up by name after restoring.
input_placeholder = tf.placeholder(tf.float32, shape=[None, 10], name='input_placeholder')
label_placeholder = tf.placeholder(tf.float32, shape=[None, 1], name='label_placeholder')

# Two hidden ReLU layers followed by a linear output layer.
x = tf.layers.dense(inputs=input_placeholder, units=10, activation=tf.nn.relu)
x = tf.layers.dense(inputs=x, units=10, activation=tf.nn.relu)
# NOTE: name='prediction' names the layer's scope; the output tensor itself
# gets a name inside that scope (print prediction.name to see it).
prediction = tf.layers.dense(inputs=x, units=1, name='prediction')

loss_op = tf.reduce_mean(tf.square(prediction - label_placeholder))
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss_op)
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_num in range(100):
        _, loss = sess.run([train_op, loss_op], feed_dict={input_placeholder: input_data, label_placeholder: label_data})
        print('epoch ' + str(epoch_num) + ', loss = ' + str(loss))
        # One checkpoint per epoch, suffixed with the 1-based epoch number.
        saver.save(sess, '../Models/model', global_step=epoch_num + 1)
And below is my Python file to restore the network. It loads the input and output placeholders, together with the operation required for making predictions. However, even though I have named an operation as prediction in the training code above, the code below cannot seem to find this operation in the loaded graph.
import tensorflow as tf
import numpy as np
# Rebuild the same synthetic inputs that were used for training.
input_data = np.zeros([100, 10])
for i in range(100):
    for j in range(10):
        input_data[i, j] = i * j / 1000

with tf.Session() as sess:
    # Recreate the graph from the .meta file, then load the weights into it.
    saver = tf.train.import_meta_graph('../Models/model-99.meta')
    saver.restore(sess, '../Models/model-99')
    graph = tf.get_default_graph()
    input_placeholder = graph.get_tensor_by_name('input_placeholder:0')
    label_placeholder = graph.get_tensor_by_name('label_placeholder:0')
    # NOTE: this is the lookup that raises
    # KeyError: "The name 'prediction' refers to an Operation not in the graph."
    prediction = graph.get_operation_by_name('prediction')
    pred = sess.run([prediction], feed_dict={input_placeholder: input_data})
Why can this code not find this operation, and what should I do to correct my code?
You have to modify a single line in your loading script (tested with tf 1.8):
prediction = graph.get_tensor_by_name('prediction/BiasAdd:0')
You have to specify which tensor you want to access, as prediction is only the name scope of the dense layer, not the name of an operation. You can check the exact name during saving with prediction.name. When restoring, use graph.get_tensor_by_name, since you are interested in the value, not in the operation producing it.

convert tensor to numpy array

I wrote the following code to extract features from two images with a deep CNN using TensorFlow:
# -*- coding: utf-8 -*-
# Implementation of Wang et al 2017: Automatic Brain Tumor Segmentation using Cascaded Anisotropic Convolutional Neural Networks. https://arxiv.org/abs/1709.00382
# Author: Guotai Wang
# Copyright (c) 2017-2018 University College London, United Kingdom. All rights reserved.
# http://cmictig.cs.ucl.ac.uk
#
# Distributed under the BSD-3 licence. Please see the file licence.txt
# This software is not certified for clinical use.
#
from __future__ import absolute_import, print_function
import numpy as np
from scipy import ndimage
import time
import os
import sys
import pickle
import tensorflow as tf
from tensorflow.contrib.data import Iterator
from util.data_loader import *
from util.data_process import *
from util.train_test_func import *
from util.parse_config import parse_config
from train import NetFactory
print("import finished")
def test(config_file):
    """Build the whole-tumor network(s), restore their weights and run every test image.

    Depending on the configuration, either a single network ('network1') or
    three per-plane networks ('network1ax', 'network1sg', 'network1cr') are
    constructed.

    Args:
        config_file: path of the configuration file passed to ``parse_config``.

    Side effects: prints progress messages, stores the concatenated
    prediction tensor of each image in ``globals()`` under the names
    ``x1``/``x2``, and writes ``test_time.txt`` into the configured
    ``save_folder``.
    """
    # 1, load configure file
    config = parse_config(config_file)
    config_data = config['data']
    config_net1 = config.get('network1', None)
    config_net2 = config.get('network2', None)
    config_net3 = config.get('network3', None)
    config_test = config['testing']
    batch_size = config_test.get('batch_size', 5)
    print("configure file loaded")

    # 2.1, network for whole tumor
    if(config_net1):
        net_type1 = config_net1['net_type']
        net_name1 = config_net1['net_name']
        data_shape1 = config_net1['data_shape']
        label_shape1 = config_net1['label_shape']
        class_num1 = config_net1['class_num']
        print("configure file of whole tumor is loaded")

        # construct graph for 1st network
        full_data_shape1 = [batch_size] + data_shape1
        x1 = tf.placeholder(tf.float32, shape = full_data_shape1)
        net_class1 = NetFactory.create(net_type1)
        net1 = net_class1(num_classes = class_num1,w_regularizer = None,
                          b_regularizer = None, name = net_name1)
        net1.set_params(config_net1)
        predicty1, caty1 = net1(x1, is_training = True)
        proby1 = tf.nn.softmax(predicty1)
    else:
        config_net1ax = config['network1ax']
        config_net1sg = config['network1sg']
        config_net1cr = config['network1cr']
        print("configure files of whole tumor in three planes are loaded")

        # construct graph for 1st network axial
        net_type1ax = config_net1ax['net_type']
        net_name1ax = config_net1ax['net_name']
        data_shape1ax = config_net1ax['data_shape']
        label_shape1ax = config_net1ax['label_shape']
        class_num1ax = config_net1ax['class_num']
        full_data_shape1ax = [batch_size] + data_shape1ax
        x1ax = tf.placeholder(tf.float32, shape = full_data_shape1ax)
        net_class1ax = NetFactory.create(net_type1ax)
        net1ax = net_class1ax(num_classes = class_num1ax,w_regularizer = None,
                              b_regularizer = None, name = net_name1ax)
        net1ax.set_params(config_net1ax)
        predicty1ax, caty1ax = net1ax(x1ax, is_training = True)
        proby1ax = tf.nn.softmax(predicty1ax)
        print("graph for 1st network1ax is constructed")

        # construct graph for 1st network sagittal
        net_type1sg = config_net1sg['net_type']
        net_name1sg = config_net1sg['net_name']
        data_shape1sg = config_net1sg['data_shape']
        label_shape1sg = config_net1sg['label_shape']
        class_num1sg = config_net1sg['class_num']
        full_data_shape1sg = [batch_size] + data_shape1sg
        x1sg = tf.placeholder(tf.float32, shape = full_data_shape1sg)
        net_class1sg = NetFactory.create(net_type1sg)
        net1sg = net_class1sg(num_classes = class_num1sg,w_regularizer = None,
                              b_regularizer = None, name = net_name1sg)
        net1sg.set_params(config_net1sg)
        predicty1sg, caty1sg = net1sg(x1sg, is_training = True)
        proby1sg = tf.nn.softmax(predicty1sg)
        print("graph for 1st network1sg is constructed")

        # construct graph for 1st network coronal
        net_type1cr = config_net1cr['net_type']
        net_name1cr = config_net1cr['net_name']
        data_shape1cr = config_net1cr['data_shape']
        label_shape1cr = config_net1cr['label_shape']
        class_num1cr = config_net1cr['class_num']
        full_data_shape1cr = [batch_size] + data_shape1cr
        x1cr = tf.placeholder(tf.float32, shape = full_data_shape1cr)
        net_class1cr = NetFactory.create(net_type1cr)
        net1cr = net_class1cr(num_classes = class_num1cr,w_regularizer = None,
                              b_regularizer = None, name = net_name1cr)
        net1cr.set_params(config_net1cr)
        predicty1cr, caty1cr = net1cr(x1cr, is_training = True)
        proby1cr = tf.nn.softmax(predicty1cr)
        print("graph for 1st network1cr is constructed")

    # 3, create session and load trained models
    all_vars = tf.global_variables()
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    # Each saver restores only the variables whose name starts with
    # "<net_name>/".
    if(config_net1):
        net1_vars = [x for x in all_vars if x.name[0:len(net_name1) + 1]==net_name1 + '/']
        saver1 = tf.train.Saver(net1_vars)
        saver1.restore(sess, config_net1['model_file'])
    else:
        net1ax_vars = [x for x in all_vars if x.name[0:len(net_name1ax) + 1]==net_name1ax + '/']
        saver1ax = tf.train.Saver(net1ax_vars)
        saver1ax.restore(sess, config_net1ax['model_file'])
        net1sg_vars = [x for x in all_vars if x.name[0:len(net_name1sg) + 1]==net_name1sg + '/']
        saver1sg = tf.train.Saver(net1sg_vars)
        saver1sg.restore(sess, config_net1sg['model_file'])
        net1cr_vars = [x for x in all_vars if x.name[0:len(net_name1cr) + 1]==net_name1cr + '/']
        saver1cr = tf.train.Saver(net1cr_vars)
        saver1cr.restore(sess, config_net1cr['model_file'])
    print("all variables of net1 is saved")

    # 4, load test images
    dataloader = DataLoader(config_data)
    dataloader.load_data()
    image_num = dataloader.get_total_image_number()

    # 5, start to test
    test_slice_direction = config_test.get('test_slice_direction', 'all')
    save_folder = config_data['save_folder']
    test_time = []
    struct = ndimage.generate_binary_structure(3, 2)
    margin = config_test.get('roi_patch_margin', 5)
    # Module-level names under which the per-image prediction tensors are
    # published; only two names exist, so a third image raises IndexError.
    x=['x1','x2']
    paddings=tf.constant([[0,0],[0,0],[10,10],[0,0],[0,0]])
    for i in range(image_num):
        [temp_imgs, temp_weight, temp_name, img_names, temp_bbox, temp_size] = dataloader.get_image_data_with_name(i)
        t0 = time.time()
        # 5.1, test of 1st network
        if(config_net1):
            data_shapes = [ data_shape1[:-1], data_shape1[:-1], data_shape1[:-1]]
            label_shapes = [label_shape1[:-1], label_shape1[:-1], label_shape1[:-1]]
            nets = [net1, net1, net1]
            outputs = [proby1, proby1, proby1]
            inputs = [x1, x1, x1]
            class_num = class_num1
        else:
            data_shapes = [ data_shape1ax[:-1], data_shape1sg[:-1], data_shape1cr[:-1]]
            label_shapes = [label_shape1ax[:-1], label_shape1sg[:-1], label_shape1cr[:-1]]
            nets = [net1ax, net1sg, net1cr]
            outputs = [proby1ax, proby1sg, proby1cr]
            inputs = [x1ax, x1sg, x1cr]
            class_num = class_num1ax
        # NOTE(review): the next three statements read names that are only
        # defined for the three-plane configuration (predicty1ax,
        # data_shape1ax, ...), so they fail when 'network1' is configured.
        # predi/cati are symbolic tensors that still depend on the input
        # placeholders; they are not evaluated here.
        predi=tf.concat([predicty1ax,tf.reshape(predicty1sg,[5,11,180,160,2]),tf.pad(predicty1cr,paddings,"CONSTANT")],0)
        cati=tf.concat([caty1ax,tf.reshape(caty1sg,[5,11,180,160,14]),tf.pad(caty1cr,paddings,"CONSTANT")],0)
        prob1 = test_one_image_three_nets_adaptive_shape(temp_imgs, data_shapes, label_shapes, data_shape1ax[-1], class_num,
                                                         batch_size, sess, nets, outputs, inputs, shape_mode = 0)
        pred1 = np.asarray(np.argmax(prob1, axis = 3), np.uint16)
        pred1 = pred1 * temp_weight
        print("net1 is tested")
        globals()[x[i]]=predi
        test_time.append(time.time() - t0)
        print(temp_name)
    test_time = np.asarray(test_time)
    print('test time', test_time.mean())
    np.savetxt(save_folder + '/test_time.txt', test_time)
if __name__ == '__main__':
    # Expect exactly one argument: the path of the configuration file.
    if(len(sys.argv) != 2):
        print('Number of arguments should be 2. e.g.')
        print(' python test.py config17/test_all_class.txt')
        exit()
    config_file = str(sys.argv[1])
    assert(os.path.isfile(config_file))
    test(config_file)
y=tf.stack([x1,x2],0)
z=tf.Session().run(y)
The output is a tensor (y) that I want to convert to a NumPy array using tf.Session().run(), but I get this error:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [5,19,180,160,4]
[[Node: Placeholder = Placeholderdtype=DT_FLOAT, shape=[5,19,180,160,4], _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
Note, this answer is based on a deep look in the crystal ball, predicting the code, which seems to be classified -- at least not written in the question itself.
Have a look at the error message:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor
This is exactly, what is wrong with your code. Trimming down, your code is essentially just (there are a lot of issues):
import tensorflow as tf
x1 = tf.placeholder(tf.float32, [None, 3])
# y is computed from x1, so evaluating y needs a concrete value for x1.
y = tf.layers.dense(x1, 2)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# No feed_dict is supplied here, which produces the
# "You must feed a value for placeholder tensor" error.
print(tf.Session().run(y))
The output tensor y cannot be evaluated without knowing the value of x1, since it depends on this value.
1. Fix: use proper naming
import tensorflow as tf
# Explicit names make error messages refer to 'my_input' instead of the
# generic 'Placeholder'.
x1 = tf.placeholder(tf.float32, [None, 3], name='my_input')
y = tf.layers.dense(x1, 2, name='fc1')
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# Still no feed_dict, so this call still fails.
print(tf.Session().run(y))
Now the error-message becomes much clearer
tensorflow.python.framework.errors_impl.InvalidArgumentError: You must feed a value for placeholder tensor 'my_input' with dtype float and shape [?,3]
2. Fix: provide a feed_dict
To let TensorFlow know, which value the computation of y should be based on, you need to feed it into the graph:
import tensorflow as tf
x1 = tf.placeholder(tf.float32, [None, 3], name='my_input')
y = tf.layers.dense(x1, 2, name='fc1')
sess = tf.InteractiveSession()
# Variables are initialized in `sess` only.
sess.run(tf.global_variables_initializer())
# The placeholder is fed now, but tf.Session() creates a second session in
# which the variables were never initialized.
np_result = tf.Session().run(y, feed_dict={x1: [[42, 43, 44]]})
Now, this reveals the second issue with your code. You have 2 sessions:
sess = tf.InteractiveSession() (session_a)
tf.Session() in tf.Session().run() (session_b)
Now, session_a get all initialized variables, since your code contains
sess.run(tf.global_variables_initializer())
But, during tf.Session().run(...) another session is created, leaving a new error message:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value ...
3. Fix: use just one session
import tensorflow as tf
x1 = tf.placeholder(tf.float32, [None, 3], name='my_input')
y = tf.layers.dense(x1, 2, name='fc1')
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# Initialization and evaluation both go through the same session.
np_result = sess.run(y, feed_dict={x1: [[42, 43, 44]]})
And to provide, the best possible solution:
import tensorflow as tf

# construct graph somewhere
x1 = tf.placeholder(tf.float32, [None, 3], name='my_input')
y = tf.layers.dense(x1, 2, name='fc1')

with tf.Session() as sess:
    # init variables / or load them
    sess.run(tf.global_variables_initializer())
    # make sure that no operations will be added to the graph
    sess.graph.finalize()
    # fetch result as numpy array
    np_result = sess.run(y, feed_dict={x1: [[42, 43, 44]]})
The code you either wrote yourself or copied from somewhere is the best demonstration of "How to not write in tensorflow."
last remark:
TensorFlow forces you to create a clean structure. This is important. It should become a habit to follow this structure. After a while, you see these parts immediately, which smells like bad code.
If you use an entire network, then just replace tf.layers.dense with my_network_definition and
def my_network_definition(x1):
    """Template for a network builder: map the input tensor ``x1`` to the output tensor.

    The body is a placeholder (``...``) to be replaced with the real layers.
    """
    output = ...
    return output
In pytorch, you can write in the arbitrary style like you provided in the question. Not saying, you should do that. But it is possible. So then, try to follow the structure TensorFlow expects from you.
Dear pytorch users, I am looking forward to your feedback.

how to restore variables in fully_connected function

In my training file(train.py), I write:
def deep_part(self):
    """Build the stack of fully connected layers on top of the embeddings.

    Reads ``self.embeddings``, ``self.field_size``, ``self.factor_size``,
    ``self.deep_layers`` and ``self.deep_layers_activation``.

    Returns:
        The output tensor of the last fully connected layer.
    """
    with tf.variable_scope("deep-part"):
        y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.factor_size])  # None * (F*K)
        # NOTE(review): the original comment here said "self.deep_layers = 2",
        # but the loop needs a sequence with one width per layer — confirm.
        for i, layer_width in enumerate(self.deep_layers):
            # Every layer gets its own scope "deep-part/fc<i>"; presumably the
            # layer's variables are created as "weights" and "biases" inside
            # that scope — verify with tf.global_variables().
            y_deep = tf.contrib.layers.fully_connected(
                y_deep, layer_width,
                activation_fn=self.deep_layers_activation, scope='fc%d' % i)
    return y_deep
Now, in the prediction file (predict.py), I restore the checkpoint, but I don't know how to reload the weights and biases of the "deep-part" network, because I think the fully_connected function might hide them.
I wrote a lengthy explanation here. A short summary:
By saver.save(sess, '/tmp/my_model') Tensorflow produces multiple files:
checkpoint
my_model.data-00000-of-00001
my_model.index
my_model.meta
The checkpoint file checkpoint is just a pointer to the latest version of our model-weights and it is simply a plain text file containing
$ !cat /tmp/model/checkpoint
model_checkpoint_path: "/tmp/my_model"
all_model_checkpoint_paths: "/tmp/my_model"
The others are binary files containing the graph (.meta) and weights (.data*).
You can help yourself by running
import tensorflow as tf
import numpy as np
data = np.arange(9 * 1).reshape(1, 9).astype(np.float32)

# Build a one-layer graph and print the tensor names needed to find the
# tensors again after restoring.
plhdr = tf.placeholder(tf.float32, shape=[1, 9], name='input')
print(plhdr.name)
activation = tf.layers.dense(plhdr, 10, name='fc')
print(activation.name)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    expected = sess.run(activation, {plhdr: data})
    print(expected)
    saver = tf.train.Saver(tf.global_variables())
    saver.save(sess, '/tmp/my_model')

# Start from an empty graph to prove everything comes back from disk.
tf.reset_default_graph()

with tf.Session() as sess:
    # load the computation graph (the fully connected + placeholder)
    loader = tf.train.import_meta_graph('/tmp/my_model.meta')
    sess.run(tf.global_variables_initializer())
    plhdr = tf.get_default_graph().get_tensor_by_name('input:0')
    activation = tf.get_default_graph().get_tensor_by_name('fc/BiasAdd:0')
    # Freshly initialized weights: the output differs from the saved run.
    actual = sess.run(activation, {plhdr: data})
    assert not np.allclose(actual, expected)
    # now load the weights (restore() returns nothing, so keep `loader` as is)
    loader.restore(sess, '/tmp/my_model')
    actual = sess.run(activation, {plhdr: data})
    assert np.allclose(actual, expected)

TensorFlow: loss jumps up after restoring RNN net

Environment info
Operating System: Windows 7 64-bit
Tensorflow installed from pre-built pip (no CUDA): 1.0.1
Python 3.5.2 64-bit
Problem
I have problems with restoring my net (RNN character base language model). Below is a simplified version with the same problem.
When I run it the first time, I get, for example, this.
...
step 160: loss = 1.956 (perplexity = 7.069016620211226)
step 180: loss = 1.837 (perplexity = 6.274748642468816)
step 200: loss = 1.825 (perplexity = 6.202084762557817)
But on the second run, after restoring parameters, I get this.
step 220: loss = 2.346 (perplexity = 10.446611983898903)
step 240: loss = 2.346 (perplexity = 10.446709120339545)
...
All the tf variables seem to be correctly restored, including the state, which will be fed to RNN.
Data position is also restored (from 'step').
I also made a similar program for MNIST recognition model, and this one works fine: the losses before and after the restoring are continuous.
Are there any other parameters or states that should be saved and restored?
import argparse
import os
import tensorflow as tf
import numpy as np
import math
B = 20 # batch size
H = 200 # size of hidden layer of neurons
T = 25 # number of time steps to unroll the RNN for
data_file = 'ptb.train.txt' # any plain text file will do
checkpoint_dir = "tmp"
#----------------
# prepare data
#----------------
# NOTE(review): the file handle is never closed explicitly.
data = open(data_file, 'r').read()
# NOTE(review): the iteration order of a set of strings can differ between
# interpreter runs, so the character <-> index mapping built from `chars`
# is not stable across restarts of the program.
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has {0} characters, {1} unique.'.format(data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
# Encode the text as indices and truncate it to a multiple of T.
input_index_raw = np.array([char_to_ix[ch] for ch in data])
input_index_raw = input_index_raw[0:len(input_index_raw) // T * T]
# Targets are the inputs shifted left by one position (wrapping around).
input_index_raw_shift = np.append(input_index_raw[1:], input_index_raw[0])
# One row per sequence of T characters.
input_all = input_index_raw.reshape([-1, T])
target_all = input_index_raw_shift.reshape([-1, T])
num_packed_data = len(input_all)
#----------------
# build model
#----------------
class Model(object):
    """Character-level RNN language model with a persistent hidden state.

    Uses the module-level constants ``B``, ``H``, ``T`` and ``vocab_size``.
    """

    def __init__(self):
        """Build placeholders, the RNN, the loss and the training op."""
        self.input_ph = tf.placeholder(tf.int32, [None, T], name="input_ph")
        self.target_ph = tf.placeholder(tf.int32, [None, T], name="target_ph")
        embedding = tf.get_variable("embedding", [vocab_size, H], initializer=tf.random_normal_initializer(), dtype=tf.float32)

        # input_ph is B x T.
        # input_embedded is B x T x H.
        input_embedded = tf.nn.embedding_lookup(embedding, self.input_ph)

        cell = tf.contrib.rnn.BasicRNNCell(H)
        self.state_ph = tf.placeholder(tf.float32, (None, cell.state_size), name="state_ph")

        # Make state variable so that it will be saved by the saver.
        self.state = tf.get_variable("state", (B, cell.state_size), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.float32)

        # Construct initial_state according to whether restoring or not.
        self.isRestore = tf.placeholder(tf.bool, shape=(), name="isRestore")
        zero_state = cell.zero_state(B, dtype=tf.float32)
        self.initial_state = tf.cond(self.isRestore, lambda: self.state, lambda: zero_state)

        # input_embedded : B x T x H
        # output: B x T x H
        # state : B x cell.state_size
        output, state_ = tf.nn.dynamic_rnn(cell, input_embedded, initial_state=self.state_ph)
        # Running final_state also stores the new state in the saved variable.
        self.final_state = tf.assign(self.state, state_)

        # reshape to (B * T) x H.
        output_flat = tf.reshape(output, [-1, H])

        # Convert hidden layer's output to vector of logits for each vocabulary.
        softmax_w = tf.get_variable("softmax_w", [H, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=tf.float32)
        logits = tf.matmul(output_flat, softmax_w) + softmax_b

        # cross_entropy is a vector of length B * T
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.target_ph, [-1]), logits=logits)
        self.loss = tf.reduce_mean(cross_entropy)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        self.global_step = tf.get_variable("global_step", (), initializer=tf.zeros_initializer(), trainable=False, dtype=tf.int32)
        # NOTE(review): the optimizer is given the per-element cross_entropy
        # vector, not self.loss (its mean) — confirm this is intended.
        self.training_op = optimizer.minimize(cross_entropy, global_step=self.global_step)

    def train_batch(self, sess, input_batch, target_batch, initial_state):
        """Run one training step.

        Args:
            sess: session to run the step in.
            input_batch: input indices fed to ``input_ph``.
            target_batch: target indices fed to ``target_ph``.
            initial_state: RNN state fed to ``state_ph``.

        Returns:
            Tuple ``(final_state, loss)`` of this step.
        """
        final_state_, _, final_loss = sess.run([self.final_state, self.training_op, self.loss], feed_dict={self.input_ph: input_batch, self.target_ph: target_batch, self.state_ph: initial_state})
        return final_state_, final_loss
# main
with tf.Session() as sess:
    if not tf.gfile.Exists(checkpoint_dir):
        tf.gfile.MakeDirs(checkpoint_dir)
    batch_stride = num_packed_data // B

    # make model
    model = Model()
    saver = tf.train.Saver()

    # always initialize
    init = tf.global_variables_initializer()
    init.run()

    # restore if necessary
    isRestore = False
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt:
        isRestore = True
        last_model = ckpt.model_checkpoint_path
        print("Loading " + last_model)
        saver.restore(sess, last_model)

    # set initial step
    step = tf.train.global_step(sess, model.global_step) + 1
    print("start step = {0}".format(step))

    # fetch initial state
    state = sess.run(model.initial_state, feed_dict={model.isRestore: isRestore})
    print("Initial state: {0}".format(state))

    while True:
        # prepare batch data: B sequences spaced batch_stride rows apart,
        # advancing by one row per step
        idx = [(step + x * batch_stride) % num_packed_data for x in range(0, B)]
        input_batch = input_all[idx]
        target_batch = target_all[idx]

        state, last_loss = model.train_batch(sess, input_batch, target_batch, state)
        if step % 20 == 0:
            print('step {0}: loss = {1:.3f} (perplexity = {2})'.format(step, last_loss, math.exp(last_loss)))
        if step % 200 == 0:
            # Save a checkpoint and stop; the next run resumes from it.
            saved_file = saver.save(sess, os.path.join(checkpoint_dir, "model.ckpt"), global_step=step)
            print("Saved to " + saved_file)
            print("Last state: {0}".format(model.state.eval()))
            break
        step = step + 1
The problem is solved. It had nothing to do with RNN nor TensorFlow.
I changed
chars = list(set(data))
to
chars = sorted(set(data))
and now it works.
This is because python uses a random hash function to build the set, and every time python restarted, 'chars' had a different ordering.