What is the correct way to load a TensorFlow model from a checkpoint but reset the learning_rate so that it starts from the beginning instead of continuing from the checkpoint?
I am using the following code for initializing the optimizer.
learning_rate = tf.train.polynomial_decay(start_learning_rate, self.global_step,
                                          decay_steps, end_learning_rate,
                                          power=power, name="new_one2")
opt = tf.train.AdamOptimizer(learning_rate)
I tried to restore var_list from the checkpoint and return only those variables whose name does not contain Adam, so that the AdamOptimizer is reinitialized from the start. But this doesn't help. Here's the function I use.
def optimistic_restore(self, save_file, ignore_vars=None, verbose=False, ignore_incompatible_shapes=False):
    """This function tries to restore all variables in the save file.
    This function ignores variables that do not exist or have incompatible shape.
    Raises TypeError if there is a type mismatch for compatible shapes.
    save_file: str
        Path to the checkpoint without the .index, .meta or .data extensions.
    ignore_vars: list, tuple or set of str
        These variables will be ignored.
    verbose: bool
        If True, prints which variables will be restored.
    ignore_incompatible_shapes: bool
        If True, ignores variables with incompatible shapes.
        If False, raises a runtime error if shapes are incompatible.
    """
    def vprint(*args, **kwargs):
        if verbose: print(*args, flush=True, **kwargs)
    # def dbg(*args, **kwargs): print(*args, flush=True, **kwargs)
    def dbg(*args, **kwargs): pass
    if ignore_vars is None:
        ignore_vars = []
    reader = tf.train.NewCheckpointReader(save_file)
    var_to_shape_map = reader.get_variable_to_shape_map()
    var_list = []
    for key in sorted(var_to_shape_map):
        if 'Adam' not in key:
            var_list.append(key)
    return var_list
This gives me all the variables except the Adam ones. Then I pass the resulting list of variables to the saver, as shown below:
varsss = self.optimistic_restore(tf.train.latest_checkpoint(my_ckpt_dir))
b = tf.global_variables()
saver = tf.train.Saver(var_list=b[:len(varsss)])
But this still doesn't help - my learning rate doesn't go back to the start.
In case someone else is also running into this problem:
The solution was rather simple. I just supplied the global_step of my polynomial_decay learning rate as a placeholder, i.e.
custom_lr = tf.placeholder(tf.int32)
learning_rate = tf.train.polynomial_decay(start_learning_rate, custom_lr,
                                          decay_steps, end_learning_rate,
                                          power=power, name="new_one2")
And simply pass the current step for custom_lr in the feed dict:
for step in range(0, 1000):
    sess.run(train_op, feed_dict={custom_lr: step})  # train_op: your training op
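For completeness, a minimal sketch of the whole pattern under TF 1.x. The tiny one-weight graph and the hyperparameter values are stand-ins, my_ckpt_dir is assumed to be your checkpoint directory, and filtering out the beta1_power/beta2_power accumulators (which Adam also creates) is my addition on top of the 'Adam'-name filter:

import tensorflow as tf

# Stand-in graph: a single trainable weight and a dummy loss.
w = tf.get_variable("w", shape=[], initializer=tf.zeros_initializer())
loss = tf.square(w - 1.0)

start_learning_rate, end_learning_rate = 0.1, 0.0001
decay_steps, power = 1000, 1.0

custom_lr = tf.placeholder(tf.int32, shape=[])  # current step, fed from Python
learning_rate = tf.train.polynomial_decay(start_learning_rate, custom_lr,
                                          decay_steps, end_learning_rate,
                                          power=power)
opt = tf.train.AdamOptimizer(learning_rate)
train_op = opt.minimize(loss)

# Restore only the model weights, not the optimizer state, so that
# Adam (and the fed-in step) restart from scratch.
optimizer_state = ('Adam', 'beta1_power', 'beta2_power')
restore_vars = [v for v in tf.global_variables()
                if not any(s in v.op.name for s in optimizer_state)]
saver = tf.train.Saver(var_list=restore_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initializes the Adam slots
    ckpt = tf.train.latest_checkpoint(my_ckpt_dir)  # my_ckpt_dir: your checkpoint dir
    if ckpt:
        saver.restore(sess, ckpt)
    for step in range(1000):
        sess.run(train_op, feed_dict={custom_lr: step})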
I am getting an error when I run the code below.
def __init__(self, model, sess, loss_fn=None):
    """
    To generate the White-box Attack Agent.
    :param model: the target model which should have the input tensor, the target tensor and the loss tensor.
    :param sess: the tensorflow session.
    :param loss_fn: None if using original loss of the model.
    """
    self.model = model
    self.input_tensor = model.inputs[0]
    self.output_tensor = model.outputs[0]
    self.target_tensor = model.targets[0]
    self._sample_weights = model.sample_weights[0]
    if loss_fn is None:
        self.loss_tensor = model.total_loss
        self.gradient_tensor = K.gradients(self.loss_tensor, self.input_tensor)[0]
    else:
        self.set_loss_function(loss_fn)
    self.sess = sess
The error:
self.target_tensor = model.targets[0], AttributeError: 'Model' object has no attribute 'targets'
I am working with TensorFlow 1.14.0, Keras 2.2.4-tf and Python 3.6.13. How can I resolve this problem?
Thank you
The model argument that your function is using apparently refers to a custom class that isn't part of stock Keras or TensorFlow. You need to make sure that the model object you're passing to the function is an instance of the appropriate class. Or, alternatively, you can jerry-rig it by doing something like this:
# whatever code instantiates the `model` would be here
...
...
model.targets = [ "targets of an appropriate structure and type go here" ]
Then, when you pass that model object to the function, it will have a targets attribute to be accessed. But it's better to just use the right object in the first place.
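For instance, a sketch of that workaround under TF 1.14, assuming a compiled single-output tf.keras model. The Dense model is purely illustrative, and whether a plain attribute assignment satisfies the rest of the agent is untested:

import tensorflow as tf

# Purely illustrative model; substitute whatever instantiates yours.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='softmax', input_shape=(784,))
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Attach a target placeholder with the output's dtype/shape so that
# `model.targets[0]` exists when the agent's __init__ accesses it.
model.targets = [tf.placeholder(dtype=model.outputs[0].dtype,
                                shape=model.outputs[0].shape)]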
Or, the third option: just comment out the offending line and hope it isn't actually used anywhere (though if it's being set, it's probably used somewhere or other):
def __init__(self, model, sess, loss_fn=None):
    """
    To generate the White-box Attack Agent.
    :param model: the target model which should have the input tensor, the target tensor and the loss tensor.
    :param sess: the tensorflow session.
    :param loss_fn: None if using original loss of the model.
    """
    self.model = model
    self.input_tensor = model.inputs[0]
    self.output_tensor = model.outputs[0]
    # self.target_tensor = model.targets[0]
    self._sample_weights = model.sample_weights[0]
    if loss_fn is None:
        self.loss_tensor = model.total_loss
        self.gradient_tensor = K.gradients(self.loss_tensor, self.input_tensor)[0]
    else:
        self.set_loss_function(loss_fn)
    self.sess = sess
I have tried all the options described in the documentation, but none of them allowed me to save my model in tensorflow 2.0.0 beta1. I also tried to upgrade to the (also unstable) TF2-RC, but that broke even the code I had working in beta, so I quickly rolled back to beta for now.
See a minimal reproduction code below.
What I have tried:
model.save("mymodel.h5")
NotImplementedError: Saving the model to HDF5 format requires the
model to be a Functional model or a Sequential model. It does not work
for subclassed models, because such models are defined via the body of
a Python method, which isn't safely serializable. Consider saving to
the Tensorflow SavedModel format (by setting save_format="tf") or
using save_weights.
model.save("mymodel", format='tf')
ValueError: Model <__main__.CVAE object at 0x7f1cac2e7c50> cannot be
saved because the input shapes have not been set. Usually, input
shapes are automatically determined from calling .fit() or .predict().
To manually set the shapes, call model._set_inputs(inputs).
3.
model._set_inputs(input_sample)
model.save("mymodel", format='tf')
AssertionError: tf.saved_model.save is not supported inside a traced
@tf.function. Move the call to the outer eagerly-executed context.
And this is where I am stuck now, because it gives me no reasonable hint whatsoever. That's because I am NOT calling the save() function from a @tf.function; I'm already calling it from the outermost scope possible. In fact, I have no @tf.function at all in the minimal reproduction script below and I still get the same error.
So I really have no idea how to save my model; I've tried every option and they all throw errors and provide no hints.
The minimal reproduction example below works fine if you set save_model=False and it reproduces the error when save_model=True.
It may seem unnecessary to use a subclassed model in this simplified auto-encoder example, but I have lots of custom functions added to it in my original VAE code that I need it for.
Code:
import tensorflow as tf

save_model = True
learning_rate = 1e-4
BATCH_SIZE = 100
TEST_BATCH_SIZE = 10
color_channels = 1
imsize = 28

(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()
train_images = train_images[:5000, ::]
test_images = train_images[:1000, ::]
train_images = train_images.reshape(-1, imsize, imsize, 1).astype('float32')
test_images = test_images.reshape(-1, imsize, imsize, 1).astype('float32')
train_images /= 255.
test_images /= 255.
train_dataset = tf.data.Dataset.from_tensor_slices(train_images).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices(test_images).batch(TEST_BATCH_SIZE)

class AE(tf.keras.Model):
    def __init__(self):
        super(AE, self).__init__()
        self.network = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(imsize, imsize, color_channels)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(50),
            tf.keras.layers.Dense(imsize**2 * color_channels),
            tf.keras.layers.Reshape(target_shape=(imsize, imsize, color_channels)),
        ])

    def decode(self, input):
        logits = self.network(input)
        return logits

optimizer = tf.keras.optimizers.Adam(learning_rate)
model = AE()

def compute_loss(data):
    logits = model.decode(data)
    loss = tf.reduce_mean(tf.losses.mean_squared_error(logits, data))
    return loss

def train_step(data):
    with tf.GradientTape() as tape:
        loss = compute_loss(data)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss, 0

def test_step(data):
    loss = compute_loss(data)
    return loss

input_shape_set = False
epoch = 0
epochs = 20
for epoch in range(epochs):
    for train_x in train_dataset:
        train_step(train_x)
    if epoch % 1 == 0:
        loss = 0.0
        num_batches = 0
        for test_x in test_dataset:
            loss += test_step(test_x)
            num_batches += 1
        loss /= num_batches
        print("Epoch: {}, Loss: {}".format(epoch, loss))

if save_model:
    print("Saving model...")
    if not input_shape_set:
        # Note: Why set input shape manually and why here:
        # 1. If I do not set input shape manually: ValueError: Model <__main__.CVAE object at 0x7f1cac2e7c50> cannot be saved because the input shapes have not been set. Usually, input shapes are automatically determined from calling .fit() or .predict(). To manually set the shapes, call model._set_inputs(inputs).
        # 2. If I set input shape manually BEFORE the first actual train step, I get: RuntimeError: Attempting to capture an EagerTensor without building a function.
        model._set_inputs(train_dataset.__iter__().next())
        input_shape_set = True
    # Note: Why choose tf format: model.save('MNIST/Models/model.h5') will return NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using save_weights.
    model.save('MNIST/Models/model', save_format='tf')
I have tried the same minimal reproduction example in tensorflow-gpu 2.0.0-rc0, and the error was more revealing than what the beta version gave me. The error in RC says:
NotImplementedError: When subclassing the Model class, you should
implement a call method.
This got me to read through https://www.tensorflow.org/beta/guide/keras/custom_layers_and_models, where I found examples of how to do subclassing in TF2 in a way that allows saving. I was able to resolve the error and have the model saved by replacing my 'decode' method with 'call' in the above example (although this will be more complicated with my actual code, where I have various methods defined for the class). This solved the error both in beta and in rc. Strangely, the training (or the saving) also got much faster in rc.
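In terms of the example above, a minimal sketch of that fix looks like this (keeping decode as a thin alias is my own addition, for code that already calls it):

class AE(tf.keras.Model):
    def __init__(self):
        super(AE, self).__init__()
        self.network = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(imsize, imsize, color_channels)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(50),
            tf.keras.layers.Dense(imsize**2 * color_channels),
            tf.keras.layers.Reshape(target_shape=(imsize, imsize, color_channels)),
        ])

    def call(self, inputs):  # Keras traces the model through call, which enables saving
        return self.network(inputs)

    def decode(self, inputs):  # alias so existing decode() call sites keep working
        return self.call(inputs)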
You should change two things:
1. Change the decode method to call, as you pointed out.
2. As your model is of type Sequential, and not built inside the class, you want to call the save method on the self.network attribute of the model, i.e.,
model.network.save('mymodel.h5')
Alternatively, to keep things more standard, you can implement this method inside the AE class, as follows:
def save(self, save_dir):
    self.network.save(save_dir)
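With that override in place, model.save('my_save_dir') on the subclassed model delegates to the inner Sequential network; note that it shadows the usual tf.keras.Model.save signature, so only pass the path.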
Cheers mate
I'm trying to load a pre-trained tensorflow object detection model from the Tensorflow Object Detection repo as a tf.estimator.Estimator and use it to make predictions.
I'm able to load the model and run inference using Estimator.predict(), however the output is garbage. Other methods of loading the model, e.g. as a Predictor, work fine for running inference.
Any help with properly loading a model as an Estimator and calling predict() would be much appreciated. My current code:
Load and prepare image
import numpy as np
import requests
import tensorflow as tf
from io import BytesIO
from PIL import Image

def load_image_into_numpy_array(image):
    (im_width, im_height) = image.size
    return np.array(list(image.getdata())).reshape((im_height, im_width, 3)).astype(np.uint8)

image_url = 'https://i.imgur.com/rRHusZq.jpg'

# Load image
response = requests.get(image_url)
image = Image.open(BytesIO(response.content))

# Format original image size
im_size_orig = np.array(list(image.size) + [1])
im_size_orig = np.expand_dims(im_size_orig, axis=0)
im_size_orig = np.int32(im_size_orig)

# Resize image
image = image.resize((np.array(image.size) / 4).astype(int))

# Format image
image_np = load_image_into_numpy_array(image)
image_np_expanded = np.expand_dims(image_np, axis=0)
image_np_expanded = np.float32(image_np_expanded)

# Stick into feature dict
x = {'image': image_np_expanded, 'true_image_shape': im_size_orig}

# Stick into input function
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=x,
    y=None,
    shuffle=False,
    batch_size=128,
    queue_capacity=1000,
    num_epochs=1,
    num_threads=1,
)
Side note:
train_and_eval_dict also seems to contain an input_fn for prediction
train_and_eval_dict['predict_input_fn']
However, this actually returns a tf.estimator.export.ServingInputReceiver, which I'm not sure what to do with. This could potentially be the source of my problems, as there's a fair bit of pre-processing involved before the model actually sees the image.
Load model as Estimator
Model downloaded from the TF Model Zoo here; code to load the model adapted from here.
import os
from object_detection import model_hparams, model_lib

model_dir = './pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28/'
pipeline_config_path = os.path.join(model_dir, 'pipeline.config')

config = tf.estimator.RunConfig(model_dir=model_dir)

train_and_eval_dict = model_lib.create_estimator_and_inputs(
    run_config=config,
    hparams=model_hparams.create_hparams(None),
    pipeline_config_path=pipeline_config_path,
    train_steps=None,
    sample_1_of_n_eval_examples=1,
    sample_1_of_n_eval_on_train_examples=(5))

estimator = train_and_eval_dict['estimator']
Run inference
output_dict1 = estimator.predict(predict_input_fn)
This prints out some log messages, one of which is:
INFO:tensorflow:Restoring parameters from ./pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28/model.ckpt
So it seems like the pre-trained weights are getting loaded. However, the resulting detections are garbage.
Load same model as a Predictor
from tensorflow.contrib import predictor
model_dir = './pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28'
saved_model_dir = os.path.join(model_dir, 'saved_model')
predict_fn = predictor.from_saved_model(saved_model_dir)
Run inference
output_dict2 = predict_fn({'inputs': image_np_expanded})
The results from this look good.
When you load the model as an estimator from a checkpoint file, the restore function associated with SSD models is the following, from ssd_meta_arch.py:
def restore_map(self,
                fine_tune_checkpoint_type='detection',
                load_all_detection_checkpoint_vars=False):
  """Returns a map of variables to load from a foreign checkpoint.
  See parent class for details.
  Args:
    fine_tune_checkpoint_type: whether to restore from a full detection
      checkpoint (with compatible variable names) or to restore from a
      classification checkpoint for initialization prior to training.
      Valid values: `detection`, `classification`. Default 'detection'.
    load_all_detection_checkpoint_vars: whether to load all variables (when
      `fine_tune_checkpoint_type='detection'`). If False, only variables
      within the appropriate scopes are included. Default False.
  Returns:
    A dict mapping variable names (to load from a checkpoint) to variables in
    the model graph.
  Raises:
    ValueError: if fine_tune_checkpoint_type is neither `classification`
      nor `detection`.
  """
  if fine_tune_checkpoint_type not in ['detection', 'classification']:
    raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
        fine_tune_checkpoint_type))
  if fine_tune_checkpoint_type == 'classification':
    return self._feature_extractor.restore_from_classification_checkpoint_fn(
        self._extract_features_scope)
  if fine_tune_checkpoint_type == 'detection':
    variables_to_restore = {}
    for variable in tf.global_variables():
      var_name = variable.op.name
      if load_all_detection_checkpoint_vars:
        variables_to_restore[var_name] = variable
      else:
        if var_name.startswith(self._extract_features_scope):
          variables_to_restore[var_name] = variable
    return variables_to_restore
As you can see, even if the config file sets from_detection_checkpoint: True, only the variables in the feature extractor scope will be restored. To restore all the variables, you have to set
load_all_detection_checkpoint_vars: True
in the config file, as sketched below.
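For reference, a sketch of the relevant train_config block in pipeline.config (the checkpoint path is illustrative; keep the rest of your config as it is):

train_config {
  fine_tune_checkpoint: "./pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28/model.ckpt"
  fine_tune_checkpoint_type: "detection"
  load_all_detection_checkpoint_vars: true
}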
So the above situation is quite clear: when the model is loaded as an Estimator, only the variables in the feature extractor scope are restored. Since the weights in the predictor scopes are not restored, the estimator understandably gives random predictions.
When the model is loaded as a Predictor, all weights are loaded, and the predictions are therefore reasonable.
My question is about context and the TensorFlow default sessions and graph.
The problem:
TensorFlow is unable to feed a placeholder in the following scenario:
Function test defines a graph.
Function test_once defines a session.
When function test calls test_once -> feeding fails.
When I change the code so that function test declares both the graph and the session -> everything works.
Here is the code:
def test_once(g, saver, summary_writer, logits, images, summary_op):
    """Run a session once for a given test image.
    Args:
        g: The graph.
        saver: Saver.
        summary_writer: Summary writer.
        logits: Logits tensor.
        images: The test images.
        summary_op: Summary op.
    """
    with tf.Session(graph=g) as sess:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # extract global_step from it.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        else:
            print('No checkpoint file found')
            return
        images = images.astype(np.float32)
        predictions = sess.run(logits, feed_dict={'InputPlaceHolder/TestInput:0': images})
        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary_writer.add_summary(summary, global_step)
        return predictions
def test():
    """Test LCPR with a test image."""
    with tf.Graph().as_default() as g:
        # Get image for testing
        images, labels = lcpr.test_input()
        # Build a Graph that computes the logits predictions from the
        # inference model.
        with tf.name_scope('InputPlaceHolder'):
            test_image_placeholder = tf.placeholder(tf.float32, (None, None, None, 3), 'TestInput')
            # Display the training images in the visualizer.
            # The 'max_outputs' default is 3. Not stated. (Max number of batch elements to generate images for.)
            # tf.summary.image('input_images', test_image_placeholder)
        with tf.name_scope('Inference'):
            logits = lcpr.inference(test_image_placeholder)
        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            lcpr.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        # Build the summary operation based on the TF collection of Summaries.
        writer = tf.summary.FileWriter("/tmp/lcpr/test")
        writer.add_graph(g)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(FLAGS.test_dir, g)
        # Sadly, this will not work:
        predictions = test_once(g, saver, summary_writer, logits, images, summary_op)
        '''Alternative working option:
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
            else:
                print('No checkpoint file found')
                return
            x = sess.run(logits, feed_dict={'InputPlaceHolder/TestInput:0': images})
            print(x)
        '''
The above code yields an error that the placeholder is not fed:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'InputPlaceHolder/TestInput' with dtype float
And it's not that TensorFlow does not recognize the placeholder. If I change the name from 'InputPlaceHolder/TestInput:0' to 'InputPlaceHolder/TestInput:1', I receive a message claiming that 'InputPlaceHolder/TestInput' exists but has only 1 output. This makes sense, and I guess the session runs on my default graph.
Things only work for me if I stay within the same def: if I change the code by running the commented-out part (starting with tf.Session() as sess:) directly from within the first function, everything works.
What am I missing?
My guess is that it's context related - maybe the session is not being assigned to the graph?
Solved. Stupid mistake.
test_once calls sess.run twice. On the second call, summary.ParseFromString(sess.run(summary_op)), no placeholder is fed, which is what triggers the error.
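The fix is to feed the placeholder on that second run as well (assuming the merged summaries depend on the input, which the error indicates); a minimal sketch of the corrected lines inside test_once:

summary = tf.Summary()
summary.ParseFromString(sess.run(summary_op,
                                 feed_dict={'InputPlaceHolder/TestInput:0': images}))
summary_writer.add_summary(summary, global_step)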
TensorFlow is telling me that the momentum part of a variable is uninitialized when I use the momentum optimizer. When I use the GradientDescent optimizer, things work fine.
Here is a relevant part of the stack trace:
tensorflow.python.framework.errors.FailedPreconditionError: Attempting to use uninitialized value fc3/biases/Momentum
[[Node: Momentum/update_fc3/biases/ApplyMomentum = ApplyMomentum[T=DT_FLOAT, _class=["loc:@fc3/biases"], use_locking=false, _device="/job:localhost/replica:0/task:0/cpu:0"](fc3/biases, fc3/biases/Momentum, Momentum/learning_rate, gradients/fc3/logits_grad/tuple/control_dependency_1, Momentum/momentum)]]
Caused by op u'Momentum/update_fc3/biases/ApplyMomentum', defined at:
...
train_op = vgg.optimizer.minimize(vgg.loss, global_step=vgg.global_step)
I think the code is correct - it defines the ops for all layers before the initialize-all-variables op, etc. If it weren't, the GradientDescent optimizer wouldn't work either, right?
Following up on @etarion's comment, here is a sketch of the code. It starts with
def train(args):
    datareader = ...  # object to read data - no tensorflow code/import
    with tf.Graph().as_default():
        with_graph(datareader, args)
then with_graph does
def with_graph(datareader, args):
    num_outputs = datareader.num_outputs()
    img_orig = tf.placeholder(tf.float32, shape=datareader.features_placeholder_shape())
    img_vgg16 = preprocess.imgbatch_2_vgg16(imgs=img_orig, channel_mean=8.46)
    labels_placeholder = tf.placeholder(tf.float32, shape=(None, num_outputs))
    vgg = vgg16(imgs=img_vgg16, weights=None, sess=None, trainable=args.trainable, stop_at_fc2=args.fc2)
    add_loss(vgg, labels_placeholder, num_outputs, args)
    add_optimizer(vgg, args)
    sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=12))
    init = tf.initialize_all_variables()
    sess.run(init)
    validation_imgs_orig, validation_labels = datareader.get_validation_set()
    validation_imgs_vgg16 = sess.run(img_vgg16, {img_orig: validation_imgs_orig})
    validation_feed_dict = {img_vgg16: validation_imgs_vgg16,
                            labels_placeholder: validation_labels}
    train_op = vgg.optimizer.minimize(vgg.loss, global_step=vgg.global_step)
    print("Starting training.")
    sys.stdout.flush()
    for step_number in range(3):
        t0 = time.time()
        train_imgs, train_labels = datareader.get_next_minibatch()
        train_feed_dict = {img_orig: train_imgs,
                           labels_placeholder: train_labels}
        sess.run(train_op, feed_dict=train_feed_dict)
        print("step %3d took %.2f sec." % (step_number, time.time() - t0))
        sys.stdout.flush()
The gradient descent optimizer does not have internal variables; the momentum one does. Somehow you don't initialize the state of the momentum optimizer (I can't tell why exactly without the code). Ways to do that are initializing all variables right before you run the graph (after you have added the optimizer to the graph), or, if you want to be explicit about what you initialize, using the get_slot_names()/get_slot() methods of the optimizer to get the Variables that make up the optimizer's internal state, as sketched below.
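For the explicit route, a sketch using the slot API (opt stands for your MomentumOptimizer instance and var_list for the variables being trained; both come from your own code):

slot_vars = [opt.get_slot(var, name)
             for name in opt.get_slot_names()
             for var in var_list
             if opt.get_slot(var, name) is not None]
sess.run(tf.variables_initializer(slot_vars))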
The problem was that I was defining the training op (via the optimizer's minimize function) after initializing all the variables; once I moved it in front of the initialize-all-variables op, it worked, as the sketch below illustrates.
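A minimal sketch of that ordering with a stand-in one-variable graph; the point is only that minimize() must be called before the init op is created:

import tensorflow as tf

w = tf.Variable(0.0)
loss = tf.square(w - 1.0)

optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
# minimize() creates the Momentum slot variables, so it must come first:
train_op = optimizer.minimize(loss)

# initialize_all_variables() only covers variables that exist when it is called.
init = tf.initialize_all_variables()

with tf.Session() as sess:
    sess.run(init)      # the Momentum slots are now initialized too
    sess.run(train_op)  # no FailedPreconditionError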