Trying to create optimizer slot variable under the scope for tf.distribute.Strategy which is different from the scope used for the original variable - tensorflow

I'm trying to train, save, and load a TensorFlow model. An outline of my code is as follows:
devices = ["/device:GPU:{}".format(i) for i in range(num_gpus)]
strategy = tf.distribute.MirroredStrategy(devices)

with strategy.scope():
    # Create model
    model = my_model(some parameters)
    model.build((parameters))
    model.summary()

    # Adam optimizer with EMA smoothing
    opt = tf.keras.optimizers.Adam(learning_rate)
    opt = tfa.optimizers.MovingAverage(opt, average_decay=ema_decay)

    model.compile(
        optimizer=opt,
        loss=loss_dict,
        loss_weights=loss_weights,
        metrics=metrics_dict)

    # Adding softmax layers
    output_list = list()
    for i in range(num_class):
        output_list.append(tf.keras.layers.Softmax(name=f"name_{str(i)}")(model.output[i]))
    output_list.append(model.output[num_class])

    model_b = tf.keras.Model(inputs=model.input, outputs=output_list)
    model_b.build((None, None, feats_dim, 1))
    model_b.compile(optimizer=opt)

model.fit(parameters... callbacks=[cp_callback, logger_callback, tb_callback])
model_b.load_weights(checkpoint_path)
model_b.save(os.path.join(model_path, "model.h5"))
The checkpoints are saved using:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, verbose=1, save_weights_only=True, period=1
)
When arriving at the line where the weights are loaded, I receive the following error:
ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7fb6047c3c50>), which is different from the scope used for the original variable (MirroredVariable:{
Any help would be appreciated.

According to this, model-saving APIs create variables (which should be distributed variables). Try creating/calling the callbacks from within the strategy scope.
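A minimal sketch of that suggestion, keeping the rest of the setup from the question unchanged. Moving the weight loading under the scope as well is my assumption based on the same rule, since that is the line that raises the error:

with strategy.scope():
    # ... build and compile model and model_b exactly as in the question ...

    # Create the checkpoint callback inside the strategy scope, so any variables
    # it creates belong to the same MirroredStrategy as the model's variables.
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path, verbose=1, save_weights_only=True, period=1)

model.fit(...)  # pass callbacks=[cp_callback, logger_callback, tb_callback] as before

with strategy.scope():
    # Restore under the original scope so the optimizer slot variables are
    # recreated under the MirroredStrategy rather than the default strategy.
    model_b.load_weights(checkpoint_path)
    model_b.save(os.path.join(model_path, "model.h5"))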

Related

MLFLOW on Databricks - Cannot log a Keras model as a mlflow.pyfunc model. Get TypeError: cannot pickle 'weakref' object

Hi all: this is one of my first posts on Stack Overflow, so apologies in advance if I'm not conforming to certain standards!
I'm having trouble saving my Keras model as an mlflow.pyfunc model, as it gives me a "cannot pickle 'weakref' object" error when I try to log it.
So why am I saving my Keras model as a pyfunc model object in the first place? It's because I want to override the default predict method and output something custom. I also want to do some pre-processing steps on X_test or new data by encoding it with a tf.keras.StringLookup and then inverting it back to get the original categorical variable class. For this reason, I was advised by Databricks that the mlflow.pyfunc flavor is the best way to go for these types of use cases.
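(For context, the encode/invert round trip described above looks roughly like this; the vocabulary is made up, and on older TF versions the layer lives under tf.keras.layers.experimental.preprocessing.StringLookup.)

import tensorflow as tf

vocab = ["approve", "reject", "escalate"]  # hypothetical category values
encoder = tf.keras.layers.StringLookup(vocabulary=vocab)
decoder = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)

ids = encoder(tf.constant(["reject", "approve"]))  # strings -> integer ids
strings = decoder(ids)                             # ids -> back to the original strings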
The Keras model works just fine, and I'm able to log it using mlflow.keras.log_model. But it fails when I try to wrap it inside a custom "KerasWrapper" class.
Here are some snippets of my code. For the purpose of debugging, the current predict method in the custom class is just the default. I simplified it to help debug, but obviously I haven't been able to resolve it.
I would be extremely grateful for any help. Thanks in advance!
ALL CODE ON AZURE DATABRICKS
Custom mlflow.pyfunc class
class KerasWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, keras_model, labelEncoder, labelDecoder, n):
        self.keras_model = keras_model
        self.labelEncoder = labelEncoder
        self.labelDecoder = labelDecoder
        self.topn = n

    def load_context(self, context):
        self.keras_model = mlflow.keras.load_model(model_uri=context.artifacts[self.keras_model], compile=False)

    def predict(self, context, input_data):
        scores = self.keras_model.predict(input_data)
        return scores
My Keras Deep Learning Model (this works fine by the way)
def build_model(vocab_size, steps, drop_embed, n_dim, encoder, modelType):
    model = None
    i = Input(shape=(None,), dtype="int64")
    # embedding layer
    e = Embedding(vocab_size, 16)(i)
    s = SpatialDropout1D(drop_embed)(e)
    x = Conv1D(256, steps, activation='relu')(s)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    # output layer
    x = Dense(vocab_size, activation='softmax')(x)
    model = Model(i, x)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
MLFLOW Section
with mlflow.start_run(run_name=runName):
    mlflow.tensorflow.autolog()

    # Build the model, compile and train on the training set
    # signature: build_model(vocab_size, steps, drop_embed, n_dim, encoder, modelType)
    keras_model = build_model((vocab_size + 1), timeSteps, drop_embed, embedding_dimensions, encoder, modelType)
    keras_model.fit(X_train_encoded, y_train_encoded, epochs=epochs, verbose=1, batch_size=32,
                    use_multiprocessing=True, validation_data=(X_test_encoded, y_test_encoded))

    # Log the model parameters used for this run.
    mlflow.log_param("numofActionsinWorkflow", numofActionsinWf)
    mlflow.log_param("timeSteps", timeSteps)

    # Wrap it up in a pyfunc model
    wrappedModel = KerasWrapper(keras_model, encoder, decoder, bestActionCount)

    # Create a model signature using the tensor input to store in the MLflow model registry
    signature = infer_signature(X_test_encoded, wrappedModel.predict(None, X_test_encoded))
    # Let's check out how it looks
    print(signature)

    # Create an input example to store in the MLflow model registry
    input_example = np.expand_dims(X_train[17], axis=0)

    # The necessary dependencies are added to a conda.yaml file which is logged along with the model.
    model_env = mlflow.pyfunc.get_default_conda_env()
    # Record specific additional dependencies required by the serving model
    model_env['dependencies'][-1]['pip'] += [
        f'tensorflow=={tf.__version__}',
        f'mlflow=={mlflow.__version__}',
        f'sklearn=={sklearn.__version__}',
        f'cloudpickle=={cloudpickle.__version__}',
    ]

    # Log the model to the experiment
    # mlflow.keras.log_model(keras_model, artifact_path=runName, signature=signature, input_example=input_example, conda_env=model_env)
    wrapped_model_path = runName
    if os.path.exists(wrapped_model_path):
        shutil.rmtree(wrapped_model_path)

    # Log model as a pyfunc model
    mlflow.pyfunc.log_model(runName, python_model=wrappedModel, signature=signature, input_example=input_example, conda_env=model_env)

    # Return the run ID for model registration
    run_id = mlflow.active_run().info.run_id
    mlflow.end_run()
Here is the error that I receive

Using the Estimator interface for inference with a pre-trained TensorFlow object detection model

I'm trying to load a pre-trained TensorFlow object detection model from the TensorFlow Object Detection repo as a tf.estimator.Estimator and use it to make predictions.
I'm able to load the model and run inference using Estimator.predict(); however, the output is garbage. Other methods of loading the model, e.g. as a Predictor, and running inference work fine.
Any help properly loading a model as an Estimator and calling predict() would be much appreciated. My current code:
Load and prepare image
def load_image_into_numpy_array(image):
    (im_width, im_height) = image.size
    return np.array(list(image.getdata())).reshape((im_height, im_width, 3)).astype(np.uint8)

image_url = 'https://i.imgur.com/rRHusZq.jpg'

# Load image
response = requests.get(image_url)
image = Image.open(BytesIO(response.content))

# Format original image size
im_size_orig = np.array(list(image.size) + [1])
im_size_orig = np.expand_dims(im_size_orig, axis=0)
im_size_orig = np.int32(im_size_orig)

# Resize image
image = image.resize((np.array(image.size) / 4).astype(int))

# Format image
image_np = load_image_into_numpy_array(image)
image_np_expanded = np.expand_dims(image_np, axis=0)
image_np_expanded = np.float32(image_np_expanded)

# Stick into feature dict
x = {'image': image_np_expanded, 'true_image_shape': im_size_orig}

# Stick into input function
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=x,
    y=None,
    shuffle=False,
    batch_size=128,
    queue_capacity=1000,
    num_epochs=1,
    num_threads=1,
)
Side note:
train_and_eval_dict also seems to contain an input_fn for prediction:
train_and_eval_dict['predict_input_fn']
However, this actually returns a tf.estimator.export.ServingInputReceiver, which I'm not sure what to do with. This could potentially be the source of my problems, as there's a fair bit of pre-processing involved before the model actually sees the image.
Load model as Estimator
Model downloaded from TF Model Zoo here, code to load model adapted from here.
model_dir = './pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28/'
pipeline_config_path = os.path.join(model_dir, 'pipeline.config')

config = tf.estimator.RunConfig(model_dir=model_dir)

train_and_eval_dict = model_lib.create_estimator_and_inputs(
    run_config=config,
    hparams=model_hparams.create_hparams(None),
    pipeline_config_path=pipeline_config_path,
    train_steps=None,
    sample_1_of_n_eval_examples=1,
    sample_1_of_n_eval_on_train_examples=(5))

estimator = train_and_eval_dict['estimator']
Run inference
output_dict1 = estimator.predict(predict_input_fn)
This prints out some log messages, one of which is:
INFO:tensorflow:Restoring parameters from ./pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28/model.ckpt
So it seems like the pre-trained weights are getting loaded. However, the results look like:
Load same model as a Predictor
from tensorflow.contrib import predictor
model_dir = './pretrained_models/tensorflow/ssd_mobilenet_v1_coco_2018_01_28'
saved_model_dir = os.path.join(model_dir, 'saved_model')
predict_fn = predictor.from_saved_model(saved_model_dir)
Run inference
output_dict2 = predict_fn({'inputs': image_np_expanded})
Results look good:
When you load the model as an Estimator from a checkpoint file, this is the restore function associated with SSD models, from ssd_meta_arch.py:
def restore_map(self,
                fine_tune_checkpoint_type='detection',
                load_all_detection_checkpoint_vars=False):
  """Returns a map of variables to load from a foreign checkpoint.

  See parent class for details.

  Args:
    fine_tune_checkpoint_type: whether to restore from a full detection
      checkpoint (with compatible variable names) or to restore from a
      classification checkpoint for initialization prior to training.
      Valid values: `detection`, `classification`. Default 'detection'.
    load_all_detection_checkpoint_vars: whether to load all variables (when
      `fine_tune_checkpoint_type='detection'`). If False, only variables
      within the appropriate scopes are included. Default False.

  Returns:
    A dict mapping variable names (to load from a checkpoint) to variables in
    the model graph.

  Raises:
    ValueError: if fine_tune_checkpoint_type is neither `classification`
      nor `detection`.
  """
  if fine_tune_checkpoint_type not in ['detection', 'classification']:
    raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
        fine_tune_checkpoint_type))
  if fine_tune_checkpoint_type == 'classification':
    return self._feature_extractor.restore_from_classification_checkpoint_fn(
        self._extract_features_scope)
  if fine_tune_checkpoint_type == 'detection':
    variables_to_restore = {}
    for variable in tf.global_variables():
      var_name = variable.op.name
      if load_all_detection_checkpoint_vars:
        variables_to_restore[var_name] = variable
      else:
        if var_name.startswith(self._extract_features_scope):
          variables_to_restore[var_name] = variable
    return variables_to_restore
As you can see, even if the config file sets from_detection_checkpoint: True, only the variables in the feature extractor scope will be restored. To restore all the variables, you will have to set
load_all_detection_checkpoint_vars: True
in the config file.
So the situation above is clear: when the model is loaded as an Estimator, only the variables in the feature extractor scope are restored; the predictor's scope weights are not restored, so the Estimator gives essentially random predictions.
When the model is loaded as a Predictor, all weights are loaded, and thus the predictions are reasonable.
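For reference, this setting lives in the train_config block of pipeline.config; a sketch of the relevant lines (the surrounding fields are elided):

train_config {
  ...
  from_detection_checkpoint: true
  load_all_detection_checkpoint_vars: true  # restore all variables, not just the feature extractor
  ...
}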

How to use Tensorflow slim Alexnet Implementation?

I am trying to solve an image classification problem using the tensorflow.contrib.slim implementation of AlexNet. If I try to create the graph without the following line of code, the graph is created successfully.
valid_predicitions = tf.nn.softmax(model(tf_validation_dataset))
But when I add this line to the code, I get the following error:
ValueError: Variable alexnet_v2/conv1/weights already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope?
I also want loss and accuracy for the test and validation data after a certain number of iterations. My complete code is as follows:
with graph.as_default():
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batchsize, height, width, channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batchsize, num_labels))
    tf_validation_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    def model(data):
        with slim.arg_scope(alexnet.alexnet_v2_arg_scope()):
            outputs, end_points = alexnet.alexnet_v2(data, num_classes=num_labels)
        return outputs

    logits = model(tf_train_dataset)

    # calculate loss
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

    # optimization step
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)

    # predictions for each dataset
    train_predicitions = tf.nn.softmax(logits)
    valid_predicitions = tf.nn.softmax(model(tf_validation_dataset))
model() is a function that creates the AlexNet each time it's called. The second time you call the function, the error message tells you that one of the variables in the AlexNet was already created, which is true.
You'll have to refactor your code so that you only create the model once. That means the input to the model should be a placeholder whose batch size is not specified (None), so that you can feed it the normal training batches or the full validation data as one big batch.
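A rough sketch of that refactor, reusing the names from the question (one possible arrangement, not the only one): the model is built once on a placeholder with an unspecified batch size, and both training batches and the validation set are fed through the same tensors.

# Build the graph once; the batch dimension is left as None.
tf_data = tf.placeholder(tf.float32, shape=(None, height, width, channels))
tf_labels = tf.placeholder(tf.float32, shape=(None, num_labels))

with slim.arg_scope(alexnet.alexnet_v2_arg_scope()):
    logits, end_points = alexnet.alexnet_v2(tf_data, num_classes=num_labels)

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_labels, logits=logits))
optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
predictions = tf.nn.softmax(logits)

# Training step: feed a training batch.
# _, l = sess.run([optimizer, loss], feed_dict={tf_data: batch_x, tf_labels: batch_y})
# Validation/test: feed the whole dataset (or chunks of it) through the same graph.
# valid_preds = sess.run(predictions, feed_dict={tf_data: valid_dataset})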

Restoring a single variable tensor saved in one model to variable tensor in another model - Tensorflow

Running on TensorFlow 1.3.0 with GPU.
I have trained a model in TF and saved just a single variable tensor using:
embeddings = tf.Variable(tf.random_uniform([4**kmer_len, embedding_size], -0.04, 0.04), name='Embeddings')
# more code, variables...
saver = tf.train.Saver({"Embeddings": embeddings})  # saving only the embeddings variable
# some more code, training the model...
saver.save(ses, './embeddings/embedding_mat')  # saving the variable
Now I have a different model in a different file, and I would like to restore just the single saved embeddings variable into it. The problem is that this new model has some more variables.
Now, when I try to restore the variable by doing:
embeddings = tf.Variable(tf.random_uniform([4**kmer_len_emb, embedding_size], -0.04, 0.04), name='Embeddings')
dense1 = tf.layers.dense(inputs=kmer_flattened, units=200, activation=tf.nn.relu, use_bias=True)
ses = tf.Session()
init = tf.global_variables_initializer()
ses.run(init)
saver = tf.train.Saver()
saver.restore(ses, './embeddings/embedding_mat')
I'm getting a "not found in checkpoint" error.
Any thoughts on how to deal with this?
Thanks
You must create an instance of Saver over just that variable:
saver = tf.train.Saver(var_list=[embeddings])
This tells your Saver instance to take care of restoring/saving only that particular variable of the graph; otherwise it will try to restore/save all of the graph's variables.
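Putting that together with the rest of the question's code, the restore flow might look like this (a sketch; the remaining variables such as dense1 still need values, which is handled here by running the global initializer before the restore):

embeddings = tf.Variable(tf.random_uniform([4**kmer_len_emb, embedding_size], -0.04, 0.04), name='Embeddings')
dense1 = tf.layers.dense(inputs=kmer_flattened, units=200, activation=tf.nn.relu, use_bias=True)

restore_saver = tf.train.Saver(var_list=[embeddings])  # this saver only knows about Embeddings

ses = tf.Session()
ses.run(tf.global_variables_initializer())                # initialize everything first
restore_saver.restore(ses, './embeddings/embedding_mat')  # then overwrite Embeddings from the checkpoint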
It is because it can't find the dense1 variables in the checkpoint. Try this:
all_var = tf.global_variables()
var_to_restore = [v for v in all_var if v.name == 'Embeddings:0']
ses.run(init)
saver = tf.train.Saver(var_to_restore)
saver.restore(ses, './embeddings/embedding_mat')

In TensorFlow, how to use a restored meta-graph if the meta-graph was fed with TFRecord input (without placeholders)

I trained a network with a TFRecord input pipeline. In other words, there were no placeholders. A simple example would be:
input, truth = _get_next_batch() # TFRecord. `input` is not a tf.placeholder
net = Model(input)
net.set_loss(truth)
optimizer = tf...(net.loss)
Let's say I have three files: ckpt-20000.meta, ckpt-20000.data-0000-of-0001, and ckpt-20000.index. I understand that one can later import the meta-graph using the .meta file and access tensors such as:
new_saver = tf.train.import_meta_graph('ckpt-20000.meta')
new_saver.restore(sess, 'ckpt-20000')
logits = tf.get_collection("logits")[0]
However, the meta-graph does not have a placeholder at the beginning of the pipeline. Is there a way I can use the meta-graph and run inference on an input?
For information, in an inference application (or script) I used to define a model with a placeholder and restore the model weights (see below). I am wondering if I can just utilize the meta-graph without re-defining the model, since that would be much simpler.
input = tf.placeholder(...)
net = Model(input)
saver = tf.train.Saver()
saver.restore(sess, 'ckpt-20000')
lgt = sess.run(net.logits, feed_dict={input: img})
You can build a graph that uses placeholder_with_default() for the inputs, so you can use both the TFRecord input pipeline and feed_dict.
An example:
input, truth = _get_next_batch()
_x = tf.placeholder_with_default(input, shape=[...], name='input')
_y = tf.placeholder_with_default(truth, shape=[...], name='label')
net = Model(_x)
net.set_loss(_y)
optimizer = tf...(net.loss)
Then during inference,
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    new_saver = tf.train.import_meta_graph('ckpt-20000.meta')
    new_saver.restore(sess, 'ckpt-20000')

    # Get the tensors by their variable name
    input = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name(...)

    # Now you can feed the inputs to your tensors
    lgt = sess.run(logits, feed_dict={input: img})
In the above example, if you don't feed input, then the input will be read from the TFRecord input pipeline.
Is there a way to do it without placeholders at test time, though? It should be possible to re-use the graph with a new input pipeline without resorting to slow placeholders (the test dataset may be very large); placeholder_with_default is a suboptimal solution in that case.
The recommended way is to save two meta graphs: one for training/validation/testing, and the other for inference.
See Building a SavedModel:
export_dir = ...
...
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

with tf.Session(graph=tf.Graph()) as sess:
    ...
    builder.add_meta_graph_and_variables(sess,
                                         [tag_constants.TRAINING],
                                         signature_def_map=foo_signatures,
                                         assets_collection=foo_assets)
...
# Add a second MetaGraphDef for inference.
with tf.Session(graph=tf.Graph()) as sess:
    ...
    builder.add_meta_graph([tag_constants.SERVING])
...
builder.save()
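For completeness, the inference MetaGraphDef saved this way can later be loaded by its tag; a sketch with the TF1 SavedModel loader (the export directory, tensor names, and dummy input are placeholders for whatever your graph actually uses):

import numpy as np
import tensorflow as tf

export_dir = '/tmp/exported_model'  # hypothetical path, same one given to SavedModelBuilder
img = np.zeros((1, 224, 224, 3), dtype=np.float32)  # dummy input, just for illustration

with tf.Session(graph=tf.Graph()) as sess:
    # Load only the inference MetaGraphDef, selected by its SERVING tag.
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    # Look up the input/output tensors by name; these names depend on how the graph was built.
    x = sess.graph.get_tensor_by_name('input:0')
    logits = sess.graph.get_tensor_by_name('logits:0')
    preds = sess.run(logits, feed_dict={x: img})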
The NMT tutorial also provides a detailed example of creating multiple graphs with shared variables: Neural Machine Translation (seq2seq) Tutorial - Building Training, Eval, and Inference Graphs.
train_graph = tf.Graph()
eval_graph = tf.Graph()
infer_graph = tf.Graph()

with train_graph.as_default():
    train_iterator = ...
    train_model = BuildTrainModel(train_iterator)
    initializer = tf.global_variables_initializer()

with eval_graph.as_default():
    eval_iterator = ...
    eval_model = BuildEvalModel(eval_iterator)

with infer_graph.as_default():
    infer_iterator, infer_inputs = ...
    infer_model = BuildInferenceModel(infer_iterator)

checkpoints_path = "/tmp/model/checkpoints"

train_sess = tf.Session(graph=train_graph)
eval_sess = tf.Session(graph=eval_graph)
infer_sess = tf.Session(graph=infer_graph)

train_sess.run(initializer)
train_sess.run(train_iterator.initializer)

for i in itertools.count():
    train_model.train(train_sess)

    if i % EVAL_STEPS == 0:
        checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=i)
        eval_model.saver.restore(eval_sess, checkpoint_path)
        eval_sess.run(eval_iterator.initializer)
        while data_to_eval:
            eval_model.eval(eval_sess)

    if i % INFER_STEPS == 0:
        checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=i)
        infer_model.saver.restore(infer_sess, checkpoint_path)
        infer_sess.run(infer_iterator.initializer, feed_dict={infer_inputs: infer_input_data})
        while data_to_infer:
            infer_model.infer(infer_sess)