Load multiple models in TensorFlow

I have written the following convolutional neural network (CNN) class in TensorFlow [I have tried to omit some lines of code for clarity]:
class CNN:
    def __init__(self,
                 num_filters=16,       # initial number of convolution filters
                 num_layers=5,         # number of convolution layers
                 num_input=2,          # number of channels in input
                 num_output=5,         # number of channels in output
                 learning_rate=1e-4,   # learning rate for the optimizer
                 display_step=5000,    # displays training results every display_step epochs
                 num_epoch=10000,      # number of epochs for training
                 batch_size=64,        # batch size for mini-batch processing
                 restore_file=None,    # restore file (default: None)
                ):
        # store the channel counts used by the placeholders below
        self.num_input = num_input
        self.num_output = num_output

        # define placeholders
        self.image = tf.placeholder(tf.float32, shape=(None, None, None, self.num_input))
        self.groundtruth = tf.placeholder(tf.float32, shape=(None, None, None, self.num_output))

        # builds CNN and computes prediction
        self.pred = self._build()

        # I have already created a tensorflow session and saver objects
        self.sess = tf.Session()
        self.saver = tf.train.Saver()

        # also, I have defined the loss function and optimizer as
        self.loss = self._loss_function()
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        if restore_file is not None:
            print("model exists...loading from the model")
            self.saver.restore(self.sess, restore_file)
        else:
            print("model does not exist...initializing")
            self.sess.run(tf.initialize_all_variables())

    def _build(self):
        # builds CNN

    def _loss_function(self):
        # computes loss

    def train(self, train_x, train_y, val_x, val_y):
        # uses mini-batches to minimize the loss
        self.sess.run(self.optimizer, feed_dict={self.image: sample, self.groundtruth: gt})
        # I save the session after n=10 epochs as:
        if epoch % n == 0:
            self.saver.save(self.sess, 'snapshot', global_step=epoch)

    # finally my predict function is
    def predict(self, X):
        return self.sess.run(self.pred, feed_dict={self.image: X})
I have trained two CNNs for two separate tasks independently. Each took around 1 day. Say, model1 and model2 are saved as 'snapshot-model1-10000' and 'snapshot-model2-10000' (with their corresponding meta files) respectively. I can test each model and compute its performance separately.
Now, I want to load these two models in a single script. I would naturally try to do it as below:
cnn1 = CNN(..., restore_file='snapshot-model1-10000',..........)
cnn2 = CNN(..., restore_file='snapshot-model2-10000',..........)
I encounter the following error [the error message is long; I just copied/pasted a snippet of it]:
NotFoundError: Tensor name "Variable_26/Adam_1" not found in checkpoint files /home/amitkrkc/codes/A549_models/snapshot-hela-95000
[[Node: save_1/restore_slice_85 = RestoreSlice[dt=DT_FLOAT, preferred_shard=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save_1/Const_0, save_1/restore_slice_85/tensor_name, save_1/restore_slice_85/shape_and_slice)]]
Is there a way to load two separate CNNs from these two files? Any suggestion/comment/feedback is welcome.
Thank you,

Yes, there is. Use separate graphs.
g1 = tf.Graph()
g2 = tf.Graph()
with g1.as_default():
    cnn1 = CNN(..., restore_file='snapshot-model1-10000', ...)
with g2.as_default():
    cnn2 = CNN(..., restore_file='snapshot-model2-10000', ...)
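Since the CNN class creates its tf.Session() inside __init__, each session is bound to whichever graph is the default at construction time, so the two models stay completely isolated. A minimal usage sketch (X1 and X2 stand in for your two test inputs):
# each CNN's session was created while its own graph was the default,
# so the two models can be queried independently without name clashes
pred1 = cnn1.predict(X1)   # runs in g1 / cnn1.sess
pred2 = cnn2.predict(X2)   # runs in g2 / cnn2.sess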
EDIT:
If you want them in the same graph, you'll have to rename some variables. One idea is to have each CNN in a separate scope and let the saver handle the variables in that scope, e.g.:
saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model1'))
and in the CNN, wrap all of your graph construction in that scope:
with tf.variable_scope('model1'):
    ...
EDIT2:
Another idea is renaming the variables which the saver manages (since I assume you want to use your saved checkpoints without retraining everything). Saving allows different variable names in the graph and in the checkpoint; have a look at the documentation for initialization.
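As an illustration of that second idea, a Saver can be built from a dictionary that maps the names stored in the checkpoint to the (renamed) variables in the current graph. A minimal sketch, assuming both CNNs live in one graph under the scopes 'model1' and 'model2', the original checkpoints were written without those prefixes, and sess is a session on that graph:
# map checkpoint names (without the scope prefix) to the renamed variables in the graph
model1_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model1')
name_map = {v.op.name.replace('model1/', '', 1): v for v in model1_vars}
restorer1 = tf.train.Saver(var_list=name_map)
restorer1.restore(sess, 'snapshot-model1-10000')
# repeat with scope 'model2' and 'snapshot-model2-10000' for the second model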

This should be a comment on the most up-voted answer, but I do not have enough reputation to post it there.
Anyway.
If you (or anyone who searched and got to this point) are still having trouble with the solution provided by lpp AND you are using Keras, check the following quote from GitHub:
This is because Keras shares a global session if no default TF session is provided.
When model1 is created, it is on graph1.
When model1 loads weights, the weights are on the Keras global session, which is associated with graph1.
When model2 is created, it is on graph2.
When model2 loads weights, the global session does not know about graph2.
The solution below may help:
from keras.models import model_from_json
from tensorflow import Graph, Session

graph1 = Graph()
with graph1.as_default():
    session1 = Session()
    with session1.as_default():
        with open('model1_arch.json') as arch_file:
            model1 = model_from_json(arch_file.read())
        model1.load_weights('model1_weights.h5')
        # K.get_session() is session1
# do the same for graph2, session2, model2
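Spelled out, the second block would look like this (the file names model2_arch.json and model2_weights.h5 are hypothetical, mirroring the first model):
graph2 = Graph()
with graph2.as_default():
    session2 = Session()
    with session2.as_default():
        with open('model2_arch.json') as arch_file:
            model2 = model_from_json(arch_file.read())
        model2.load_weights('model2_weights.h5')
Remember to re-enter the matching graph and session contexts (with graph2.as_default(): with session2.as_default(): ...) whenever you later call model2.predict(...).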

You need to create 2 sessions and restore the 2 models separately. In order for this to work you need to do the following:
1a. When you're saving the models you need to add scopes to the variable names. That way you will know which variables belong to which model:
# The first model
tf.Variable(tf.zeros([self.batch_size]), name="model_1/Weights")
...
# The second model
tf.Variable(tf.zeros([self.batch_size]), name="model_2/Weights")
...
1b. Alternatively, if you have already saved the models, you can rename the variables by adding a scope with a small renaming script (a sketch follows below).
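A minimal sketch of such a renaming script (my assumption about what a renaming script would do, not the exact script the answer refers to): it loads every variable from the old checkpoint, recreates it under the scope prefix, and saves a new checkpoint.
import tensorflow as tf

def add_scope_to_checkpoint(old_checkpoint, new_checkpoint, scope):
    with tf.Graph().as_default(), tf.Session() as sess:
        # recreate every variable from the old checkpoint under the new scope
        for name, _ in tf.contrib.framework.list_variables(old_checkpoint):
            value = tf.contrib.framework.load_variable(old_checkpoint, name)
            tf.Variable(value, name='%s/%s' % (scope, name))
        sess.run(tf.global_variables_initializer())
        tf.train.Saver().save(sess, new_checkpoint)

add_scope_to_checkpoint('snapshot-model1-10000', 'snapshot-model1-scoped', 'model_1')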
2. When you restore the different models, you need to filter by variable name like this:
# The first model
sess_1 = tf.Session()
sess_1.run(tf.initialize_all_variables())
saver_1 = tf.train.Saver([v for v in tf.all_variables() if 'model_1' in v.name])
saver_1.restore(sess_1, weights_1_file)
sess_1.run(pred, feed_dict={image: X})
# The second model
sess_2 = tf.Session()
sess_2.run(tf.initialize_all_variables())
saver_2 = tf.train.Saver([v for v in tf.all_variables() if 'model_2' in v.name])
saver_2.restore(sess_2, weights_2_file)
sess_2.run(pred, feed_dict={image: X})

I encountered the same problem and could not solve it (without retraining) with any solution I found on the internet. So what I did was load each model in two separate threads which communicate with the main thread. It is simple enough to write the code; you just have to be careful when you synchronize the threads.
In my case each thread received the input for its problem and returned its output to the main thread. It works without any observable overhead.
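The answer above does not include code, but the idea can be sketched roughly like this (the queue-based messaging and the worker structure are my assumptions, not the author's actual implementation; the CNN class is the one from the question):
import threading
import queue

def model_worker(restore_file, in_q, out_q):
    # each worker builds its model in its own graph and session
    g = tf.Graph()
    with g.as_default():
        cnn = CNN(restore_file=restore_file)
        for X in iter(in_q.get, None):   # stop when None is received
            out_q.put(cnn.predict(X))

in1, out1 = queue.Queue(), queue.Queue()
t1 = threading.Thread(target=model_worker,
                      args=('snapshot-model1-10000', in1, out1), daemon=True)
t1.start()

in1.put(X_test)          # send an input to the worker thread
result1 = out1.get()     # receive the prediction back in the main thread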

One way is to clear your session if you want to train or load multiple models in succession. You can easily do this using
from keras import backend as K

# load and use model 1
K.clear_session()

# load and use model 2
K.clear_session()
K.clear_session() destroys the current TF graph and creates a new one.
This is useful to avoid clutter from old models/layers.
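A short sketch of the full pattern, assuming two hypothetical saved models model1.h5 and model2.h5:
from keras import backend as K
from keras.models import load_model

model1 = load_model('model1.h5')
preds1 = model1.predict(X1)
K.clear_session()          # drop model1's graph before building the next one

model2 = load_model('model2.h5')
preds2 = model2.predict(X2)
K.clear_session()
Note that after clear_session() the first model can no longer be used, so this approach suits loading models one after the other rather than holding both in memory at once, as the question asks.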

Related

Training seq2seq model on Google Colab TPU with big dataset

I'm trying to train a sequence-to-sequence model for machine translation using Keras on a Google Colab TPU.
I have a dataset which I can load in memory, but I have to preprocess it to feed it to the model. In particular, I need to convert the target words to one-hot vectors, and with many examples I can't fit the entire conversion in memory, so I need to make batches of data.
I'm using this function as a batch generator:
def generate_batch_bert(X_ids, X_masks, y, batch_size=1024):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X_ids), batch_size):
            # batch of encoder and decoder data
            encoder_input_data_ids = X_ids[j:j+batch_size]
            encoder_input_data_masks = X_masks[j:j+batch_size]
            y_decoder = y[j:j+batch_size]

            # decoder target and input for teacher forcing
            decoder_input_data = y_decoder[:,:-1]
            decoder_target_seq = y_decoder[:,1:]

            # batch of decoder target data
            decoder_target_data = to_categorical(decoder_target_seq, vocab_size_fr)

            # keep only with the right amount of instances for training on TPU
            if encoder_input_data_ids.shape[0] == batch_size:
                yield([encoder_input_data_ids, encoder_input_data_masks, decoder_input_data], decoder_target_data)
The problem is that whenever I try to run the fit function as follows:
model.fit(x=generate_batch_bert(X_train_ids, X_train_masks, y_train, batch_size=batch_size),
          steps_per_epoch=train_samples//batch_size,
          epochs=epochs,
          callbacks=callbacks,
          validation_data=generate_batch_bert(X_val_ids, X_val_masks, y_val, batch_size=batch_size),
          validation_steps=val_samples//batch_size)
I get the following error:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_util.py:445 make_tensor_proto
raise ValueError("None values not supported.")
ValueError: None values not supported.
I'm not sure what's wrong or how I can solve this problem.
EDIT
I tried loading a smaller amount of data in memory so that the conversion of the target words to one-hot encoding doesn't crash the kernel, and it actually works. So there is obviously something wrong with how I generate batches.
It's hard to tell what's wrong since you don't provide your model definition or any sample data. However, I'm fairly certain that you're running into the same TensorFlow bug that I recently got bitten by.
The workaround is to use the tensorflow.data API, which works much better with TPUs. Like this:
from tensorflow.data import Dataset
import tensorflow as tf

def map_fn(X_id, X_mask, y):
    decoder_target_data = tf.one_hot(y[1:], vocab_size_fr)
    return (X_id, X_mask, y[:-1]), decoder_target_data

...

X_ids = Dataset.from_tensor_slices(X_ids)
X_masks = Dataset.from_tensor_slices(X_masks)
y = Dataset.from_tensor_slices(y)
ds = Dataset.zip((X_ids, X_masks, y)).map(map_fn).batch(1024)

model.fit(x=ds, ...)
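One extra detail worth noting (my addition, not part of the original answer): TPUs generally require static batch shapes, which the generator in the question handled by skipping short batches; with tf.data the same effect can be obtained with drop_remainder:
# drop the last, smaller batch so every batch has a static shape on the TPU
ds = Dataset.zip((X_ids, X_masks, y)).map(map_fn).batch(1024, drop_remainder=True)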

How to create two graphs for train and validation?

When I read the TensorFlow guidance about graphs and sessions (Graphs and Sessions), I found that they suggest creating two graphs for training and validation.
I think this is reasonable, and I want to use it because my training and validation models are different (for encoder-decoder mode or dropout). However, I don't know how to make the variables in the trained graph available to the test graph without using tf.train.Saver().
When I create two graphs and create variables inside each graph, I find that these two sets of variables are totally different, as they belong to different graphs.
I have googled a lot and I know there are questions about this problem, such as question1, but there is still no useful answer. Is there any code example, or does anyone know how to create two graphs for training and validation separately, such as:
def train_model():
    g_train = tf.Graph()
    with g_train.as_default():
        train_models

def validation_model():
    g_test = tf.Graph()
    with g_test.as_default():
        test_models
One easy way of doing that is to create a 'forward function' that defines the model and changes behaviour based on extra parameters.
Here is an example:
def forward_pass(x, is_training, reuse=tf.AUTO_REUSE, name='model_forward_pass'):
    # Note the reuse attribute as it tells the getter to either create the graph or get the weights
    with tf.variable_scope(name, reuse=reuse):
        x = tf.layers.conv2d(x, ...)
        ...
        x = tf.layers.dense(x, ...)
        x = tf.layers.dropout(x, rate, training=is_training)  # Note the is_training attribute
        ...
        return x
Now you can call the 'forward_pass' function anywhere in your code. You simply need to provide the is_training attribute to use the correct mode for dropout, for example. The 'reuse' argument will automatically get the correct values for your weights as long as the 'name' of the 'variable_scope' is the same.
For example:
train_logits_model1 = forward_pass(x_train, is_training=True, name='model1')
# Graph is defined and dropout is used in training mode
test_logits_model1 = forward_pass(x_test, is_training=False, name='model1')
# Graph is reused but the dropout behaviour change to inference mode
train_logits_model2 = forward_pass(x_train2, is_training=True, name='model2')
# Name changed, model2 is added to the graph and dropout is used in training mode
To add to this answer: as you stated that you want to have 2 separate graphs, you could do that using an assign function:
train_graph = forward_pass(x, is_training=True, reuse=False, name='train_graph')
...
test_graph = forward_pass(x, is_training=False, reuse=False, name='test_graph')
...
train_vars = tf.get_collection('variables', 'train_graph/.*')
test_vars = tf.get_collection('variables', 'test_graph/.*')

test_assign_ops = []
for test, train in zip(test_vars, train_vars):
    test_assign_ops += [tf.assign(test, train)]
assign_op = tf.group(*test_assign_ops)

sess.run(assign_op)  # Replace vars in the test_graph by the ones in train_graph
I'm a big advocate of method 1, as it is way cleaner and reduces memory usage.

Cannot save model using model.save following multi_gpu_model in Keras

Following the upgrade to Keras 2.0.9, I have been using the multi_gpu_model utility, but I can't save my models or best weights using
model.save('path')
The error I get is
TypeError: can't pickle module objects
I suspect there is some problem gaining access to the model object. Is there a workaround for this issue?
To be honest, the easiest approach is to examine the multi-GPU parallel model using
parallel_model.summary()
(The parallel model is simply the model after applying the multi_gpu function.) This clearly highlights the actual model (in, I think, the penultimate layer; I am not at my computer right now). Then you can use the name of this layer to save the model:
model = parallel_model.get_layer('sequential_1')
Often it's called sequential_1, but if you are using a published architecture it may be 'googlenet' or 'alexnet'. You will see the name of the layer from the summary.
Then it's simple to just save:
model.save()
Maxim's approach works, but it's overkill, I think.
Rem: you will need to compile both the model and the parallel model.
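Putting that together, a rough sketch of the whole flow under the assumptions above (the inner layer name 'sequential_1' is hypothetical and must be read off your own summary() output; the exact import path of multi_gpu_model varies slightly across Keras 2.x versions):
from keras.utils import multi_gpu_model

parallel_model = multi_gpu_model(model, gpus=2)
model.compile(loss='categorical_crossentropy', optimizer='adam')
parallel_model.compile(loss='categorical_crossentropy', optimizer='adam')

parallel_model.fit(x_train, y_train, epochs=10, batch_size=256)

parallel_model.summary()                          # find the inner model's layer name here
inner = parallel_model.get_layer('sequential_1')  # name taken from the summary
inner.save('best_model.h5')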
Workaround
Here's a patched version that doesn't fail while saving:
from keras.layers import Lambda, concatenate
from keras import Model
import tensorflow as tf

def multi_gpu_model(model, gpus):
    if isinstance(gpus, (list, tuple)):
        num_gpus = len(gpus)
        target_gpu_ids = gpus
    else:
        num_gpus = gpus
        target_gpu_ids = range(num_gpus)

    def get_slice(data, i, parts):
        shape = tf.shape(data)
        batch_size = shape[:1]
        input_shape = shape[1:]
        step = batch_size // parts
        if i == num_gpus - 1:
            size = batch_size - step * i
        else:
            size = step
        size = tf.concat([size, input_shape], axis=0)
        stride = tf.concat([step, input_shape * 0], axis=0)
        start = stride * i
        return tf.slice(data, start, size)

    all_outputs = []
    for i in range(len(model.outputs)):
        all_outputs.append([])

    # Place a copy of the model on each GPU,
    # each getting a slice of the inputs.
    for i, gpu_id in enumerate(target_gpu_ids):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('replica_%d' % gpu_id):
                inputs = []
                # Retrieve a slice of the input.
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_i = Lambda(get_slice,
                                     output_shape=input_shape,
                                     arguments={'i': i,
                                                'parts': num_gpus})(x)
                    inputs.append(slice_i)

                # Apply model on slice
                # (creating a model replica on the target device).
                outputs = model(inputs)
                if not isinstance(outputs, list):
                    outputs = [outputs]

                # Save the outputs for merging back together later.
                for o in range(len(outputs)):
                    all_outputs[o].append(outputs[o])

    # Merge outputs on CPU.
    with tf.device('/cpu:0'):
        merged = []
        for name, outputs in zip(model.output_names, all_outputs):
            merged.append(concatenate(outputs,
                                      axis=0, name=name))
        return Model(model.inputs, merged)
You can use this multi_gpu_model function until the bug is fixed in Keras. Also, when loading the model, it's important to provide the tensorflow module object:
model = load_model('multi_gpu_model.h5', {'tf': tf})
How it works
The problem is with the import tensorflow line in the middle of multi_gpu_model:
def multi_gpu_model(model, gpus):
    ...
    import tensorflow as tf
    ...
This creates a closure for the get_slice lambda function, which includes the number of gpus (that's ok) and the tensorflow module (not ok). Saving the model tries to serialize all layers, including the ones that call get_slice, and fails exactly because tf is in the closure.
The solution is to move the import out of multi_gpu_model, so that tf becomes a global object, which is still available for get_slice to work. This fixes the problem of saving, but when loading one has to provide tf explicitly.
This needs a little workaround: load the multi_gpu_model weights into the regular model's weights.
e.g.
# 1. instantiate your base model on a CPU
with tf.device("/cpu:0"):
    model = create_model()

# 2. put your model on multiple GPUs, say 2
multi_model = multi_gpu_model(model, 2)

# 3. compile both models
model.compile(loss=your_loss, optimizer=your_optimizer(lr))
multi_model.compile(loss=your_loss, optimizer=your_optimizer(lr))

# 4. train the multi-GPU model
# multi_model.fit() or multi_model.fit_generator()

# 5. save weights
model.set_weights(multi_model.get_weights())
model.save(filepath=filepath)
Reference: https://github.com/fchollet/keras/issues/8123

How do I change the Signatures of my SavedModel without retraining the model?

I just finished training my model only to find out that I exported a model for serving that had problems with the signatures. How do I update them?
(One common problem is setting the wrong shape for CloudML Engine).
Don't worry -- you don't need to retrain your model. That said, there is a little work to be done. You're going to create a new (corrected) serving graph, load the checkpoints into that graph, and then export this graph.
For example, suppose you added a placeholder but didn't set the shape, even though you meant to (e.g., to run on CloudML). In that case, your graph may have looked like:
x = tf.placeholder(tf.float32)
y = foo(x)
...
To correct this:
import os
import tensorflow as tf
from tensorflow.python.saved_model import builder

# Create the *correct* graph
with tf.Graph().as_default() as new_graph:
    x = tf.placeholder(tf.float32, shape=[None])
    y = foo(x)
    saver = tf.train.Saver()

# (Re-)define the inputs and the outputs.
inputs = {"x": tf.saved_model.utils.build_tensor_info(x)}
outputs = {"y": tf.saved_model.utils.build_tensor_info(y)}
signature = tf.saved_model.signature_def_utils.build_signature_def(
    inputs=inputs,
    outputs=outputs,
    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME
)

with tf.Session(graph=new_graph) as session:
    # Restore the variables
    vars_path = os.path.join(old_export_dir, 'variables', 'variables')
    saver.restore(session, vars_path)

    # Save out the corrected model; note that add_meta_graph_and_variables
    # expects a list of tags and a signature_def_map keyed by signature name.
    b = builder.SavedModelBuilder(new_export_dir)
    b.add_meta_graph_and_variables(
        session,
        [tf.saved_model.tag_constants.SERVING],
        signature_def_map={'serving_default': signature})
    b.save()
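As a quick sanity check (my addition, not part of the original answer), the corrected export can be loaded back to confirm the tags and signatures look right:
with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], new_export_dir)
    print(meta_graph.signature_def['serving_default'])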

Tensorflow (tf-slim) Model with is_training True and False

I would like to run a given model both on the train set (is_training=True) and on the validation set (is_training=False), specifically with respect to how dropout is applied. Right now the prebuilt models expose a parameter is_training that is passed to the dropout layer when building the network. The issue is that if I call the method twice with different values of is_training, I will get two different networks that do not share weights (I think?). How do I go about getting the two networks to share the same weights so that I can run the network that I have trained on the validation set?
Based on your comment, I wrote a solution to use Overfeat in train and test mode. (I couldn't test it, so can you check whether it works?)
First some imports and parameters:
import tensorflow as tf
slim = tf.contrib.slim
overfeat = tf.contrib.slim.nets.overfeat
batch_size = 32
inputs = tf.placeholder(tf.float32, [batch_size, 231, 231, 3])
dropout_keep_prob = 0.5
num_classes = 1000
In train mode, we pass a normal scope to the function overfeat:
scope = 'overfeat'
is_training = True
output = overfeat.overfeat(inputs, num_classes, is_training,
                           dropout_keep_prob, scope=scope)
Then in test mode, we create the same scope but with reuse=True.
scope = tf.VariableScope(reuse=True, name='overfeat')
is_training = False
output = overfeat.overfeat(inputs, num_classes, is_training,
                           dropout_keep_prob, scope=scope)
You can just use a placeholder for is_training:
isTraining = tf.placeholder(tf.bool)
# create nn
net = ...
net = slim.dropout(net,
                   keep_prob=0.5,
                   is_training=isTraining)
net = ...
# training
sess.run([net], feed_dict={isTraining: True})
# testing
sess.run([net], feed_dict={isTraining: False})
It depends on the case; the solutions are different.
My first option would be to use a different process to do the evaluation. You only need to check that there is a new checkpoint and load those weights into the evaluation network (with is_training=False):
checkpoint = tf.train.latest_checkpoint(self.checkpoints_path)
# wait until a new checkpoint is available
while self.lastest_checkpoint == checkpoint:
    time.sleep(30)  # sleep 30 seconds waiting for a new checkpoint
    checkpoint = tf.train.latest_checkpoint(self.checkpoints_path)
logging.info('Restoring model from {}'.format(checkpoint))
self.saver.restore(session, checkpoint)
self.lastest_checkpoint = checkpoint
The second option is, after every epoch, to unload the graph and create a new evaluation graph. This solution wastes a lot of time loading and unloading graphs.
The third option is to share the weights. But feeding these networks with queues or datasets can lead to issues, so you have to be very careful. I only use this for Siamese networks.
with tf.variable_scope('the_scope') as scope:
    your_model(is_training=True)
    scope.reuse_variables()
    your_model(is_training=False)