tf.saved_model.builder.SavedModelBuilder.save runs forever when graph contains big variable - tensorflow

I am trying to export my model using tf.saved_model.builder.SavedModelBuilder.save. My model has a variable for the word embeddings. With the test version, which is a 100 KB file, the save works well. After replacing it with the full version (GloVe 300d, about 1 GB), the save runs forever.
def main():
    """Restore the checkpointed graph and export it as a SavedModel.

    Imports the meta graph, restores the weights, builds a PREDICT signature
    over the `word_ids`, `sequence_lengths` and `feature_vecs` inputs and the
    `proj/Reshape_1` / `transitions` outputs, and writes the SavedModel to
    `c:\\myModel\\exported`.
    """
    # NOTE(review): this op is created before the meta graph is imported, so it
    # only covers variables that already exist in the default graph here.
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        with tf.device('device:CPU:0'):
            saver = tf.train.import_meta_graph(
                'c:\\myModel\\model.weights\\.meta', clear_devices=True)
            sess.run(init_op)
            saver.restore(sess, "c:\\myModel\\model.weights\\")

            # inputs
            word_ids = sess.graph.get_tensor_by_name('word_ids:0')
            sequence_lengths = sess.graph.get_tensor_by_name('sequence_lengths:0')
            feature_vecs = sess.graph.get_tensor_by_name('feature_vecs:0')
            # output
            logits = sess.graph.get_tensor_by_name('proj/Reshape_1:0')
            transitions = sess.graph.get_tensor_by_name('transitions:0')

            word_ids_info = utils.build_tensor_info(word_ids)
            sequence_lengths_info = utils.build_tensor_info(sequence_lengths)
            # Fixed: this used to be built from `sequence_lengths`, so the
            # 'feature_vecs' signature input pointed at the wrong tensor.
            feature_vecs_info = utils.build_tensor_info(feature_vecs)
            logits_info = utils.build_tensor_info(logits)
            transitions_info = utils.build_tensor_info(transitions)

            prediction_signature = signature_def_utils.build_signature_def(
                inputs={
                    'word_ids': word_ids_info,
                    'sequence_lengths': sequence_lengths_info,
                    'feature_vecs': feature_vecs_info,
                },
                outputs={
                    'logits': logits_info,
                    'transitions': transitions_info,
                },
                method_name=signature_constants.PREDICT_METHOD_NAME)

            legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
            builder = saved_model_builder.SavedModelBuilder('c:\\myModel\\exported')
            builder.add_meta_graph_and_variables(
                sess,
                [tag_constants.SERVING],
                signature_def_map={
                    signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_signature,
                },
                legacy_init_op=legacy_init_op)
            # NOTE(review): as_text=True serializes the graph as a text proto.
            # If the large embedding is stored in the graph as a constant
            # initializer, this is presumably what makes save() appear to
            # hang; consider as_text=False -- TODO confirm.
            builder.save(as_text=True)

Related

Retrain Frozen Graph in Tensorflow 2.x

I have managed to implement retraining of a frozen graph in TensorFlow 1, following this wonderfully detailed topic. Basically, the methodology is as follows:
Load frozen model
Replace the constant frozen node with variable node.
The newly replaced variable node then will be redirected to the corresponding output of the frozen node.
This works in TensorFlow 1.x, which can be verified by checking tf.compat.v1.trainable_variables. However, in TensorFlow 2.x it no longer works.
Below is the code snippet:
1/ Load frozen model
# Load the frozen (constants-only) model into a fresh graph.
frozen_path = '...'
detection_graph = tf.Graph()
with detection_graph.as_default():
    # Read the serialized GraphDef bytes from disk.
    with tf.compat.v1.io.gfile.GFile(frozen_path, 'rb') as fid:
        serialized_graph = fid.read()
    # Parse the bytes and import the nodes without a name prefix.
    od_graph_def = tf.compat.v1.GraphDef()
    od_graph_def.ParseFromString(serialized_graph)
    tf.graph_util.import_graph_def(od_graph_def, name='')
2/ Create a clone
# Clone every frozen constant that has a matching "<name>/read" op into a
# trainable variable, and remember the constant-name -> variable-name mapping.
with detection_graph.as_default():
    const_var_name_pairs = {}
    graph_ops = detection_graph.get_operations()
    probable_variables = [op for op in graph_ops if op.type == "Const"]
    available_names = [op.name for op in graph_ops]
    # Set lookup avoids scanning the whole name list for every constant.
    known_names = set(available_names)
    frozen_ops = [
        op for op in probable_variables if op.name + '/read' in known_names
    ]
    frozen_tensors = [
        detection_graph.get_tensor_by_name('{}:0'.format(op.name))
        for op in frozen_ops
    ]
    # Evaluate all constants in one session instead of opening a new session
    # for each constant inside the loop.
    with tf.compat.v1.Session() as s:
        frozen_values = s.run(frozen_tensors)
    for op, tensor, tensor_as_numpy_array in zip(frozen_ops, frozen_tensors, frozen_values):
        name = op.name
        var_shape = tensor.get_shape()
        # Give each variable a name that doesn't already exist in the graph
        var_name = '{}_turned_var'.format(name)
        var = tf.Variable(name=var_name, dtype=op.outputs[0].dtype,
                          initial_value=tensor_as_numpy_array,
                          trainable=True, shape=var_shape)
        const_var_name_pairs[name] = var_name
3/ Replace the frozen nodes using Graph Editor
import graph_def_editor as ge

# Redirect the consumers of each frozen "<const>/read" node to the read op of
# the variable that was cloned from it.
ge_graph = ge.Graph(detection_graph.as_graph_def())
name_to_op = {n.name: n for n in ge_graph.nodes}
for const_name, var_name in const_var_name_pairs.items():
    const_op = name_to_op[const_name + '/read']
    var_reader_op = name_to_op[var_name + '/Read/ReadVariableOp']
    ge.swap_outputs(ge.sgv(const_op), ge.sgv(var_reader_op))
detection_training_graph = ge_graph.to_tf_graph()
with detection_training_graph.as_default():
    writer = tf.compat.v1.summary.FileWriter('remap', detection_training_graph)
    # Fixed: `writer.close` without parentheses only referenced the method, so
    # the writer was never closed.
    writer.close()
The problem was in how I used Graph Editor: I imported the tf.GraphDef instead of the original tf.Graph, which has the Variables.
It can be solved quickly by fixing step 3.
Sol1: Using Graph Editor
# Build the editor graph from the tf.Graph itself (not from its GraphDef).
ge_graph = ge.Graph(detection_graph)
for const_name, var_name in const_var_name_pairs.items():
    frozen_read_name = const_name + '/read'
    variable_read_name = var_name + '/Read/ReadVariableOp'
    const_op = ge_graph._node_name_to_node[frozen_read_name]
    var_reader_op = ge_graph._node_name_to_node[variable_read_name]
    # Swap outputs so consumers of the frozen read now use the variable read.
    ge.swap_outputs(ge.sgv(const_op), ge.sgv(var_reader_op))
However, this requires disabling eager execution. To work around this while keeping eager execution enabled, attach the MetaGraphDef to Graph Editor as below:
with detection_graph.as_default():
    # Export a MetaGraphDef and hand its collection definitions to the editor
    # graph.
    meta_saver = tf.compat.v1.train.Saver()
    meta = meta_saver.export_meta_graph()
    collection_defs = ge.graph._extract_collection_defs(meta)
    ge_graph = ge.Graph(detection_graph, collections=collection_defs)
However, this is the trickiest part of making the model trainable in TF 2.x.
Instead of using Graph Editor to export the graph directly, we should export it ourselves. The reason is that Graph Editor makes the Variables' data type resource. Therefore, we should export the graph as a GraphDef and import the variable definitions into the graph:
# Re-import the edited GraphDef into a fresh graph and re-register each
# variable from a hand-built VariableDef.
test_graph = tf.Graph()
with test_graph.as_default():
    tf.import_graph_def(ge_graph.to_graph_def(), name="")
    for var_name in ge_graph.variable_names:
        var = ge_graph.get_variable_by_name(var_name)
        ret = variable_pb2.VariableDef(
            variable_name=var._variable_name,
            initial_value_name=var._initial_value_name,
            initializer_name=var._initializer_name,
            snapshot_name=var._snapshot_name,
            trainable=var._trainable,
            is_resource=True,
        )
        tf_var = tf.Variable(variable_def=ret, dtype=tf.float32)
        test_graph.add_to_collections(var.collection_names, tf_var)
Sol2: Manually map by Graphdef
# Remap the GraphDef by hand, then rebuild the variables in a new graph.
with detection_graph.as_default() as graph:
    training_graph_def = remap_input_node(
        detection_graph.as_graph_def(), const_var_name_pairs)
    current_var = tf.compat.v1.trainable_variables()
    assert len(current_var) > 0, "no training variables"

detection_training_graph = tf.Graph()
with detection_training_graph.as_default():
    tf.graph_util.import_graph_def(training_graph_def, name='')
    for var in current_var:
        # var.name[:-2] drops the last two characters of the tensor name
        # (presumably the ":0" output suffix) to get the op-name prefix.
        op_prefix = var.name[:-2]
        ret = variable_pb2.VariableDef()
        ret.variable_name = var.name
        ret.initial_value_name = op_prefix + '/Initializer/initial_value:0'
        ret.initializer_name = op_prefix + '/Assign'
        ret.snapshot_name = op_prefix + '/Read/ReadVariableOp:0'
        ret.trainable = True
        ret.is_resource = True
        tf_var = tf.Variable(variable_def=ret, dtype=tf.float32)
        detection_training_graph.add_to_collections(
            {'trainable_variables', 'variables'}, tf_var)
    # Check that the new graph now reports trainable variables.
    current_var = tf.compat.v1.trainable_variables()
    assert len(current_var) > 0, "no training variables"

FailedPreconditionError: FailedPr...onError()

I get a FailedPreconditionError when running the session.
My network has two different parts: a pre-trained network and a newly added Recognition network.
The pre-trained model is used to extract features, and those features are then used to train the Recognition network.
In my code, the pre-trained model is loaded first.
graph = tf.Graph()
with graph.as_default():
    input_data, input_labels, input_boxes = input_train_data.input_fn()
    # convert from Nx8 to 2Nx4
    input_boxes = tf.reshape(input_boxes, [input_boxes.shape[0] * 2, -1])

    # build model and loss
    net = Net(input_data, is_training=False)
    f_saver = tf.train.Saver(
        max_to_keep=1000,
        write_version=tf.train.SaverDef.V2,
        save_relative_paths=True)

    # Negative fraction -> grow GPU memory on demand; positive -> fixed cap;
    # zero leaves the session defaults untouched.
    sess_config = tf.ConfigProto(
        log_device_placement=False, allow_soft_placement=True)
    if FLAGS.gpu_memory_fraction < 0:
        sess_config.gpu_options.allow_growth = True
    elif FLAGS.gpu_memory_fraction > 0:
        sess_config.gpu_options.per_process_gpu_memory_fraction = (
            FLAGS.gpu_memory_fraction)

    session = tf.Session(graph=graph, config=sess_config)
    tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
    f_saver.restore(session, config.train.init_checkpoint)
f_saver restores the pre-trained model.
Then feature conv5_3 is extracted and fed into Recognition network.
# Re-enter the graph the session was created for, so the recognition ops are
# added to that same graph.
with graph.as_default():
    conv5_3 = net.end_points['conv5_3']
    with tf.variable_scope("Recognition"):
        global_step_rec = tf.Variable(0, name='global_step_rec', trainable=False)
        # Pass through recognition net
        r_net = regnet.ConstructRecNet(conv5_3)
        conv7_7 = r_net.end_points['pool7']

        # implement ROI Pooling
        # input boxes be in x1, y1, x2, y2
        h_fmap = tf.dtypes.cast(tf.shape(conv7_7)[1], tf.float32)
        w_fmap = tf.dtypes.cast(tf.shape(conv7_7)[2], tf.float32)
        # remap boxes at input images to feature maps
        #input_boxes = input_boxes / tf.constant([config.train.input_shape[0], config.train.input_shape[0],\
        # config.train.input_shape[0], config.train.input_shape[0]], dtype=tf.float32)#Normalize with image size first
        box_scale = tf.diag([w_fmap, h_fmap, w_fmap, h_fmap])
        remap_boxes = tf.matmul(input_boxes, box_scale)
        # put first column with image indexes (row index halved; presumably
        # two box rows per image after the Nx8 -> 2Nx4 reshape -- TODO confirm)
        row_ids = tf.expand_dims(tf.range(remap_boxes.shape[0]), 1) / 2
        indexed_boxes = tf.concat([tf.cast(row_ids, tf.float32), remap_boxes], -1)
        # keep only rows whose columns 4.. do not sum to zero
        keep_mask = tf.not_equal(tf.reduce_sum(indexed_boxes[:, 4:], axis=1), 0)
        remap_boxes = tf.gather_nd(indexed_boxes, tf.where(keep_mask))
        remap_boxes = tf.dtypes.cast(remap_boxes, tf.int32)
        prob = roi_pooling(conv7_7, remap_boxes, pool_height=1, pool_width=28)

        # Get features for CTC training
        prob = tf.transpose(prob, (1, 0, 2))  # prepare for CTC
        # input seq length, batch size
        data_length = tf.fill([tf.shape(prob)[1]], tf.shape(prob)[0])
        ctc_parts = tf.py_func(
            CTCUtils.compute_ctc_from_labels,
            [input_labels],
            [tf.int64, tf.int64, tf.int64])
        ctc_labels = tf.to_int32(
            tf.SparseTensor(ctc_parts[0], ctc_parts[1], ctc_parts[2]))
        decoded = tf.nn.ctc_beam_search_decoder(
            prob, data_length, merge_repeated=False, beam_width=10)
        predictions = tf.to_int32(decoded[0][0])
        # These two ops are created only for their names; the Python results
        # are intentionally not kept.
        tf.sparse_tensor_to_dense(predictions, default_value=-1, name='d_predictions')
        tf.reduce_mean(
            tf.edit_distance(predictions, ctc_labels, normalize=False),
            name='error_rate')

        per_example_loss = tf.compat.v1.nn.ctc_loss(
            inputs=prob,
            labels=ctc_labels,
            sequence_length=data_length,
            ctc_merge_repeated=True)
        loss = tf.reduce_mean(per_example_loss, name='loss')
        learning_rate = tf.train.piecewise_constant(
            global_step_rec,
            [150000, 200000],
            [config.train.learning_rate,
             0.1 * config.train.learning_rate,
             0.01 * config.train.learning_rate])
        opt_loss = tf.contrib.layers.optimize_loss(
            loss, global_step_rec, learning_rate, config.train.opt_type,
            config.train.grad_noise_scale, name='train_step')
        # NOTE(review): the init op returned here is discarded and never run
        # in the visible code.
        tf.global_variables_initializer()
I can run the session up to the feature extraction (conv5_3), but I can't run anything in the Recognition part; it fails with FailedPreconditionError: FailedPr...onError(). What could be the problem?
# Freeze the graph, then evaluate the tensors one session.run call at a time.
graph.finalize()
with tf.variable_scope("Recognition"):
    for _ in range(config.train.steps):
        input_data_, input_labels_, input_boxes_ = session.run(
            [input_data, input_labels, input_boxes])
        # can run this line
        conv5_3_ = session.run([conv5_3])
        # got FailedPreconditionError: FailedPr...onError() error at this line
        global_step_rec_ = session.run([global_step_rec])
        conv7_7_ = session.run([conv7_7])
        h_fmap_ = session.run([h_fmap])
Now it works.
Since my graph has two parts, I need to initialize them separately.
(1) First, get all variables that belong to the pre-trained model, so they can be initialized from the checkpoint.
Then restore them with tf.train.Saver.
(2) Then initialize the remaining, newly added layers using tf.global_variables_initializer().
My code is as follow.
#Initialization
#Initialize pre-trained model first
#Since we need to restore pre-trained model and initialize to respective variables in this current graph
#(1)make a variable list for checkpoint
#(2)initialize a saver for the variable list
#(3)then restore
#(1)
def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
    """Print and return the sorted variable names stored in a checkpoint.

    Args:
        file_name: checkpoint path handed to the checkpoint reader.
        tensor_name: accepted but not used by this function.
        all_tensors: when falsy, nothing is printed and an empty list is
            returned.

    Returns:
        The variable names found in the checkpoint, sorted, or an empty list.
    """
    reader = pywrap_tensorflow.NewCheckpointReader(file_name)
    if not all_tensors:
        return []
    names = sorted(reader.get_variable_to_shape_map())
    for name in names:
        print(name)
    return names
# (1) names of every variable stored in the pre-trained checkpoint
varlist = print_tensors_in_checkpoint_file(
    file_name=config.train.init_checkpoint, all_tensors=True, tensor_name=None)
# (2) all variables of the current graph
variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
# (3) restore only the variables the checkpoint actually contains. This
# replaces the hard-coded `variables[:46]` slice, which silently breaks as
# soon as the number or creation order of variables changes.
checkpoint_names = set(varlist)
pretrained_variables = [v for v in variables if v.op.name in checkpoint_names]
loader = tf.train.Saver(pretrained_variables)
tf.logging.info('Initialize from: ' + config.train.init_checkpoint)
sess_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
# Negative fraction -> grow GPU memory on demand; positive -> fixed cap.
if FLAGS.gpu_memory_fraction < 0:
    sess_config.gpu_options.allow_growth = True
elif FLAGS.gpu_memory_fraction > 0:
    sess_config.gpu_options.per_process_gpu_memory_fraction = FLAGS.gpu_memory_fraction
session = tf.Session(graph=graph, config=sess_config)
loader.restore(session, config.train.init_checkpoint)
Then initialize the rest of variables
# Initialize only the variables that are still uninitialized. Running
# tf.global_variables_initializer() at this point would re-run every
# initializer and overwrite the weights that were just restored.
uninitialized_names = set(session.run(tf.report_uninitialized_variables()))
init = tf.variables_initializer(
    [v for v in tf.global_variables()
     if v.op.name.encode() in uninitialized_names])
session.run(init)

A2C is not working due to critic loss is not converging

I'm trying to do my own implementation of the Advantage Actor Critic algorithm by using tensorflow. I used the code in https://github.com/BoYanSTKO/Practical_RL-coursera/blob/master/week5_policy_based/practice_a3c.ipynb as a rough template on how I should write the algorithm.
I tried it on the simple CartPole-v0 gym environment, but my implementation fails badly. The critic's loss just explodes and becomes way too large, while the actor's loss is rather low.
I'm not sure what I'm doing wrong here. Any help? :)
I've tried separating the actor and critic from each other by using two different networks. This did not help either. I have also tried tuning some hyperparameters such as gamma and the learning rate, without any success.
#!/usr/bin/python
import tensorflow as tf
import numpy as np
import gym
import random
from tensorboardX import SummaryWriter
class ActorCritic():
    """Advantage actor-critic network with a shared hidden layer.

    The actor head outputs action logits and the critic head outputs one
    state value per input state. Both heads are trained together by a single
    Adam step on `actor_loss + critic_loss`.
    """

    def __init__(self, state_dim, n_actions, learning_rate, gamma=0.99):
        """Build the graph.

        Args:
            state_dim: size of one state vector.
            n_actions: number of discrete actions.
            learning_rate: learning rate passed to the Adam optimizer.
            gamma: discount factor used in the one-step value target.
        """
        with tf.variable_scope("ActorCritic"):
            self.states_ph = tf.placeholder(tf.float32, (None, state_dim), name="states")
            self.action_ph = tf.placeholder(tf.int32, (None,), name="actions")
            self.n_actions = n_actions
            self.reward_ph = tf.placeholder(tf.float32, (None,), name="rewards")
            # Fixed: these two placeholders were also named "rewards".
            self.next_state_values = tf.placeholder(
                tf.float32, (None,), name="next_state_values")
            self.is_done_ph = tf.placeholder(tf.float32, (None,), name="is_done")

            net = tf.layers.dense(self.states_ph, 24, activation=tf.nn.relu)
            self.logits = tf.layers.dense(net, n_actions, activation=None)
            # Shape (batch, 1); kept as-is because predict_state_values
            # returns this tensor.
            self.state_values = tf.layers.dense(net, 1, activation=None)

            self.action_probs = tf.nn.softmax(self.logits)
            self.log_prob = tf.nn.log_softmax(self.logits)
            self.entropy = -tf.reduce_sum(
                self.action_probs * self.log_prob, axis=-1, name="entropy")
            self.logp_actions = tf.reduce_sum(
                self.log_prob * tf.one_hot(self.action_ph, depth=n_actions), axis=-1)

            # One-step target; (1 - done) removes the bootstrap term at
            # episode ends.
            self.target_state_values = (
                self.reward_ph
                + gamma * (1.0 - self.is_done_ph) * self.next_state_values)
            # Fixed: the target has shape (batch,) but state_values has shape
            # (batch, 1). Subtracting them directly broadcasts to
            # (batch, batch), which corrupts both losses. Squeeze the value
            # head to (batch,) first.
            self.advantage = (
                self.target_state_values - tf.squeeze(self.state_values, axis=1))

            self.actor_loss = (
                -tf.reduce_mean(self.logp_actions * tf.stop_gradient(self.advantage))
                - 0.01 * tf.reduce_mean(self.entropy))
            self.critic_loss = tf.reduce_mean(self.advantage ** 2.0)
            self.train_opt = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(self.actor_loss + self.critic_loss)

    def train(self, states, actions, rewards, is_done, nxt_state_values_batch):
        """Run one optimizer step; returns [critic_loss, actor_loss, train_op result]."""
        sess = tf.get_default_session()
        return sess.run(
            [self.critic_loss, self.actor_loss, self.train_opt],
            feed_dict={
                self.next_state_values: nxt_state_values_batch,
                self.states_ph: states,
                self.action_ph: actions,
                self.reward_ph: rewards,
                self.is_done_ph: is_done,
            })

    def predict_state_values(self, states):
        """Return the critic's value estimates for `states`, shape (batch, 1)."""
        sess = tf.get_default_session()
        return sess.run(self.state_values, feed_dict={self.states_ph: states})

    def sample_actions(self, states):
        """Sample one action per state from the current policy."""
        sess = tf.get_default_session()
        action_probs = sess.run(self.action_probs, {self.states_ph: states})
        return [
            np.random.choice(range(self.n_actions), p=action_prob)
            for action_prob in action_probs
        ]
class EnvBatch():
    """A fixed-size batch of independent gym environments stepped in lockstep."""

    def __init__(self, env_name, n_envs):
        """Create `n_envs` copies of `env_name` and record its dimensions."""
        self.envs = [gym.make(env_name) for _ in range(n_envs)]
        first_env = self.envs[0]
        self.n_actions = first_env.action_space.n
        self.state_dim = first_env.observation_space.shape[0]

    def reset(self):
        """Reset every environment; returns the initial states as plain lists."""
        return [env.reset().tolist() for env in self.envs]

    @staticmethod
    def _step_and_reset(env, action):
        """Step one env; when the episode ends, return the reset state."""
        state, reward, done, _ = env.step(action)
        if done:
            state = env.reset()
        return state, reward, done

    def step(self, actions):
        """Apply one action per environment.

        Returns:
            Three numpy arrays: next states, rewards and done flags. For an
            environment that finished, the returned state is the state after
            reset while its done flag stays True.
        """
        transitions = [
            self._step_and_reset(env, action)
            for action, env in zip(actions, self.envs)
        ]
        states_batch = [t[0] for t in transitions]
        rewards_batch = [t[1] for t in transitions]
        is_done_batch = [t[2] for t in transitions]
        return np.array(states_batch), np.array(rewards_batch), np.array(is_done_batch)
def evaluate_performance(env_name, agent, nr_runs=10):
    """Play `nr_runs` full episodes with `agent` and return the mean return.

    Args:
        env_name: gym environment id passed to gym.make.
        agent: object exposing sample_actions(states).
        nr_runs: number of episodes to average over.

    Returns:
        The mean of the accumulated per-episode rewards.
    """
    env = gym.make(env_name)
    episode_returns = []
    for _ in range(nr_runs):
        state = env.reset()
        episode_return = 0.0
        is_done = False
        while not is_done:
            # sample_actions takes a batch, so wrap the single state in a list.
            action = agent.sample_actions([state])[0]
            state, reward, is_done, _info = env.step(action)
            episode_return += reward
        episode_returns.append(episode_return)
    return np.mean(episode_returns)
# Training script: 10 parallel CartPole environments, one A2C update per step.
tf.reset_default_graph()
env = EnvBatch("CartPole-v0", 10)
agent = ActorCritic(env.state_dim, env.n_actions, learning_rate=0.001)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
state_batch = env.reset()
writer = SummaryWriter()
for i in range(100000):
    actions = agent.sample_actions(state_batch)
    nxt_state_batch, rewards_batch, is_done_batch = env.step(actions)
    nxt_state_values = agent.predict_state_values(nxt_state_batch).ravel()
    critic_loss, actor_loss, _ = agent.train(
        state_batch, actions, rewards_batch, is_done_batch, nxt_state_values)
    writer.add_scalar("actor_loss", actor_loss, i)
    writer.add_scalar("critic_loss", critic_loss, i)
    if i % 50 == 0:
        test_reward = evaluate_performance("CartPole-v0", agent)
        writer.add_scalar("test_reward", test_reward, i)
        if test_reward > 195:
            # Function-call form works on both Python 2 and Python 3.
            print("Done!")
    # Fixed: this assigned to `states_batch` (typo), so `state_batch` was
    # never advanced and every update reused the initial reset states.
    state_batch = nxt_state_batch
sess.close()
writer.close()

TF-serving with NMT

I am working on exporting a translation model for serving using TF-Serving.
I have referred to the issue at the link below.
https://github.com/tensorflow/serving/issues/712
The model which is being served always seems to give the same result irrespective of the input it receives. I am using the below code.
def export(self):
    """Restore the inference model and write it out as a SavedModel.

    Builds the inference model, restores it from `self._ckpt_path`, runs one
    decode on a fixed sample sentence, and exports a predict signature with
    input key 'seq_input' and output key 'seq_output' to `self._export_dir`.
    """
    infer_model = self._create_infer_model()
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(graph=infer_model.graph, config=session_config) as sess:
        feature_config = {
            'input': tf.FixedLenSequenceFeature(dtype=tf.string, shape=[], allow_missing=True),
        }
        # A fixed sentence stands in for real input while exporting.
        tf_example = ['This is created just for export']
        inference_input = tf.identity(tf_example, name="inference_input")

        saver = infer_model.model.saver
        saver.restore(sess, self._ckpt_path)

        # initialize tables
        sess.run(tf.tables_initializer())
        # Feed the sample sentence into the source placeholder of the iterator.
        sess.run(
            infer_model.iterator.initializer,
            feed_dict={infer_model.src_placeholder: inference_input.eval()})

        # get outputs of model and keep the first one as the inference result
        inference_outputs, _ = infer_model.model.decode(sess=sess)
        inference_output = inference_outputs[0]

        # NOTE(review): decode() is given the session, so it presumably
        # returns already-evaluated values. tf.convert_to_tensor below would
        # then bake that value into the graph as a constant, which would
        # explain why the served model always returns the same text. Exporting
        # a graph tensor (e.g. infer_model.model.sample_words) may be what is
        # intended -- TODO confirm.
        output_tensor = tf.convert_to_tensor(inference_output)

        # The keys `seq_input` / `seq_output` are free to choose, but the
        # client must use the same keys in its inference request.
        inference_signature = tf.saved_model.signature_def_utils.predict_signature_def(
            inputs={'seq_input': infer_model.src_placeholder},
            outputs={'seq_output': output_tensor})

        legacy_ini_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder = tf.saved_model.builder.SavedModelBuilder(self._export_dir)
        # The signature map key (the default serving key here) could be
        # changed, but the client must stay consistent with it.
        serving_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        builder.add_meta_graph_and_variables(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={serving_key: inference_signature},
            legacy_init_op=legacy_ini_op,
            clear_devices=True,
            assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS))
        builder.save(as_text=True)
    print("Done!")
In this case I am always getting the output as
"This is just for export"
Any assistance would be great.
Thanks,
Sujith.

TensorFlow: string_input_producer output string not in order with shuffle = False

I got 2 list of filenames like filenames_L = [1a,2a,3a,4a,...] and filenames_R = [1b,2b,3b,4b,...] , and I use the code below to make 2 queues.
"""queue for left images"""
filenames_L = reader.file_name('stereo_dataset/fly_frames_cleanpass/TRAIN', 'Left', 'png')
# If first file is a png, assume they all are
png = filenames_L[0].lower().endswith('png')
# Pick the decoder once; both the left and right pipelines use it.
decode_image = tf.image.decode_png if png else tf.image.decode_jpeg

filenames_L = tf.convert_to_tensor(filenames_L)
filename_queue_L = tf.train.string_input_producer(
    filenames_L, shuffle=False, num_epochs=FLAGS.epoch)
reader_L = tf.WholeFileReader()
name_L, img_bytes_L = reader_L.read(filename_queue_L)
image_L = decode_image(img_bytes_L, channels=3)
processed_image_L = image_preprocessing_fn(image_L, FLAGS.height, FLAGS.width)
processed_images_L = tf.train.batch(
    [processed_image_L], FLAGS.batch_size, dynamic_pad=True)

# queue for right images
filenames_R = reader.file_name('stereo_dataset/fly_frames_cleanpass/TRAIN', 'Right', 'png')
filenames_R = tf.convert_to_tensor(filenames_R)
filename_queue_R = tf.train.string_input_producer(
    filenames_R, shuffle=False, num_epochs=FLAGS.epoch)
reader_R = tf.WholeFileReader()
name_R, img_bytes_R = reader_R.read(filename_queue_R)
image_R = decode_image(img_bytes_R, channels=3)
processed_image_R = image_preprocessing_fn(image_R, FLAGS.height, FLAGS.width)
processed_images_R = tf.train.batch(
    [processed_image_R], FLAGS.batch_size, dynamic_pad=True)
And then I use the code below to get their names.
with tf.Session(config=config) as sess:
    init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]
    sess.run(init_ops)
    # Start the queue-runner threads that fill the input queues.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    start_time = time.time()
    while not coord.should_stop():
        # Fetch the left/right file names in the same run as the train step.
        fetches = [train_op, loss, global_step, name_L, name_R]
        _, loss_t, step, name_Left, name_Right = sess.run(
            fetches, feed_dict={disparity_map: disparity})
What I got from name_Left, name_Right are (3a,3b), (5a,5b), ... But I expected the output to be (1a,1b), (2a,2b), ...
I had a similar issue using tf.train.batch and tf.train.shuffle_batch.
The problem was in the use of threads (similar to this: https://github.com/tensorflow/tensorflow/issues/410).
I solved it by setting only one thread. When I did that, the order was preserved.