Related
So, I have done some modifications with the VGG16 neural network to make a linear-regression and classification model.
And at last I am writing the following code to compile and fit my data into the model to initiate training->
class Facetracker(Model):
# The initialization function-
def __init__(self,eyetracker,**kwargs):
super().__init__(**kwargs)
self.model = eyetracker #Instantiating the model
def compile(self,opt,classlosss,localization_loss,**kwargs):
super().compile(**kwargs)
self.classloss = class_loss
self.localization_loss = regress_loss
self.opt = optimizer
# Defining the training step
def train_step(self,batch,**kwargs):
X,y = batch #unpacking our data
with tf.GradientTape() as tape:
classes,coords = self.model(X,training=True)
batch_classloss = self.classloss(y[0],classes)
batch_localloss = self.localization_loss(tf.cast(y[1],tf.float32),coords)
# calculating total loss-
total_loss = batch_localloss+0.5*batch_classloss
grad = tape.gradient(total_loss,self.model.trainable_variables)
optimizer.apply_gradients(zip(grad,self.model.trainable_variables))
return{
"total_loss":total_loss,
"class_loss":batch_classloss,
"localilzation_loss":batch_localloss
}
def test_step(self,batch):
X,y = batch
classes,coords = self.model(X,training=False)
batch_classloss = self.classloss(y[0],classes)
batch_localloss = self.localization_loss(tf.cast(y[1],tf.float32),coords)
total_loss = batch_localloss+0.5*batch_classloss
return{
"total_loss": total_loss,
"class_loss": batch_classloss,
"localilzation_loss": batch_localloss
}
# def call(self, X, **kwargs):
# return self.model(X,**kwargs)
# Replacing the call function with a lambda function
lambda self,X,**kwargs: self.model(X,**kwargs)
# Subclassing our model-
print("Subclassing.....")
model = Facetracker(facetracker)
print("Compiling......")
model.compile(optimizer,classlosss=class_loss,localization_loss=localization_loss)
# Preparing the log directory
logdir="logdir"
tensorboard_callbacks = tf.keras.callbacks.TensorBoard(log_dir=logdir)
print("Fitting the model")
hist = model.fit(train.take(80),
epochs=16,
initial_epoch =8,
validation_data=val,
validation_steps =8,
validation_freq=2,
callbacks = [[tensorboard_callbacks]])
This includes the class prepared for training the model and the last few lines are subclassing the model and fitting the prepared data into the model.
The error I now get seems pretty hefty to me and it goes like this->
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\main.py", line 535, in <module>
hist = model.fit(train.take(80),
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\tensorflow\python\eager\execute.py", line 54, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:
Detected at node 'gradient_tape/sub_2/BroadcastGradientArgs' defined at (most recent call last):
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\main.py", line 535, in <module>
hist = model.fit(train.take(80),
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\keras\engine\training.py", line 1409, in fit
tmp_logs = self.train_function(iterator)
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\keras\engine\training.py", line 1051, in train_function
return step_function(self, iterator)
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\keras\engine\training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\MarkATT\lib\site-packages\keras\engine\training.py", line 1030, in run_step
outputs = model.train_step(data)
File "C:\Users\Radhe Krishna\OneDrive\Documents\MarkATT\main.py", line 498, in train_step
grad = tape.gradient(total_loss,self.model.trainable_variables)
Node: 'gradient_tape/sub_2/BroadcastGradientArgs'
Incompatible shapes: [2,4] vs. [8]
[[{{node gradient_tape/sub_2/BroadcastGradientArgs}}]] [Op:__inference_train_function_22026]
It would be a great help if someone could examine this problem and help me get out of it. I have been quite struggling with it now.
Thanks in advance!!!
I has planned to initiate training with the code provided, but I firstly received an error for the call function and as I resolved it by converting it into lambda function- This issue of incompatible shapes came up. I tries to adjust the integers in the input data and batch related fields but nothing worked.
I'm trying to integrate Optuna with DeepSpeech in order to optimise some of its hyperparameters. I'm sticking to learning rate for now, just to get a feel for how Optuna works, but I've hit a roadblock and need some help.
I have a function hps_train which is what does the training step. It takes the Optuna trial object as the argument and returns the dev loss, which is what I want to use Optuna to minimise. This is the exact same function as train() in training/deepspeech_training/train.py, but with a few modifications:
def hps_train(trial):
#.
#.Same as train() in https://github.com/mozilla/DeepSpeech/blob/master/training/deepspeech_training/train.py
#.
if FLAGS.horovod:
# Effective batch size in synchronous distributed training is scaled by the number of workers. An increase in learning rate compensates for the increased batch size.
optimizer = hps_create_optimizer(learning_rate_var * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)
else:
optimizer, learning_rate_var = hps_create_optimizer(trial)
reduce_learning_rate_op = learning_rate_var.assign(
tf.multiply(learning_rate_var, FLAGS.plateau_reduction)
)
#.
#.Same as train() https://github.com/mozilla/DeepSpeech/blob/master/training/deepspeech_training/train.py
#.
with tfv1.Session(config=Config.session_config) as session:
#.
#.Same as train() https://github.com/mozilla/DeepSpeech/blob/master/training/deepspeech_training/train.py
#.
final_dev_loss = dev_losses[-1]
log_debug("Session closed.")
return final_dev_loss
I also have some helper functions:
def hps_create_optimizer(trial):
learning_rate = trial.suggest_float("adam_lr", 1e-5, 1e-1, log=True)
with tf.variable_scope("learning_rate", reuse=tf.AUTO_REUSE):
learning_rate_var = tfv1.get_variable(
"learning_rate", initializer=learning_rate, trainable=False
)
optimizer = tfv1.train.AdamOptimizer(
learning_rate=learning_rate_var, beta1=0.9, beta2=0.999, epsilon=1e-08
)
return optimizer, learning_rate_var
def new_trial_callback(study, trial):
chkpt_path = setup_dirs(study.study_name, trial.number + 1)
FLAGS.checkpoint_dir = chkpt_path
FLAGS.save_checkpoint_dir = chkpt_path
FLAGS.load_checkpoint_dir = chkpt_path
def objective(trial, session):
if FLAGS.train_files:
val_loss = hps_train(trial, session)
return float(val_loss)
def objective_tf(trial):
tfv1.reset_default_graph()
with tfv1.Graph().as_default():
return objective(trial, session)
Putting it all together:
def main(_):
initialize_globals()
early_training_checks()
lr_study = optuna.create_study(study_name="lr_study", direction='minimize')
chkpt_dir = setup_dirs(lr_study.study_name, 0)
FLAGS.checkpoint_dir = chkpt_dir
FLAGS.save_checkpoint_dir = chkpt_dir
FLAGS.load_checkpoint_dir = chkpt_dir
lr_study.optimize(objective_tf, n_trials=25, callbacks=[new_trial_callback])
When I run this code, the first run completes normally. However, when it tries to start the second one, I get an error:
$ python training/hparam_search.py --train_files ~/datasets/cv-corpus-1/en/clips/train.csv --dev_files ~/datasets/cv-corpus-1/en/clips/dev.csv --test_files ~/datasets/cv-corpus-1/en/clips/test.csv --train_batch_size 64 --test_batch_size 64 --dev_batch_size 64 --n_hidden 512 --epochs 1 --train_cudnn --use_allow_growth --checkpoint_dir checkpoints
[I 2021-08-30 15:06:16,637] A new study created in memory with name: lr_study
I Could not find best validating checkpoint.
I Could not find most recent checkpoint.
I Initializing all variables.
I STARTING Optimization
Epoch 0 | Training | Elapsed Time: 0:00:17 | Steps: 187 | Loss: 252.374135
Epoch 0 | Validation | Elapsed Time: 0:00:12 | Steps: 109 | Loss: 255.176724 | Dataset: /home/user/datasets/cv-corpus-1/en/clips/dev.csv
I Saved new best validating model with loss 255.176724 to: checkpoints/optuna_trials/lr_study/0/best_dev-187
--------------------------------------------------------------------------------
I FINISHED optimization in 0:00:30.553797
[I 2021-08-30 15:06:50,101] Trial 0 finished with value: 255.1767243551552 and parameters: {'adam_lr': 0.006636434104761772}. Best is trial 0 with value: 255.1767243551552.
[W 2021-08-30 15:06:50,229] Trial 1 failed because of the following error: ValueError('in converted code:\n relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:\n\n contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call\n training)\n contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward\n seed=self._seed)\n contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn\n outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)\n python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3\n time_major=time_major, name=name)\n python/framework/op_def_library.py:367 _apply_op_helper\n g = ops._get_graph_from_inputs(_Flatten(keywords.values()))\n python/framework/ops.py:5979 _get_graph_from_inputs\n _assert_same_graph(original_graph_element, graph_element)\n python/framework/ops.py:5914 _assert_same_graph\n (item, original_item))\n\n ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).\n',)
Traceback (most recent call last):
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "training/hparam_search.py", line 671, in objective_tf
return objective(trial)
File "training/hparam_search.py", line 660, in objective
val_loss = hps_train(trial)
File "training/hparam_search.py", line 332, in hps_train
iterator, optimizer, dropout_rates
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 317, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 195, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 133, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call
training)
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward
seed=self._seed)
contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3
time_major=time_major, name=name)
python/framework/op_def_library.py:367 _apply_op_helper
g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
python/framework/ops.py:5979 _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
python/framework/ops.py:5914 _assert_same_graph
(item, original_item))
ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).
Traceback (most recent call last):
File "training/hparam_search.py", line 691, in <module>
absl.app.run(main)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "training/hparam_search.py", line 684, in main
lr_study.optimize(objective_tf, n_trials=25, callbacks=[new_trial_callback])
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/study.py", line 409, in optimize
show_progress_bar=show_progress_bar,
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 76, in _optimize
progress_bar=progress_bar,
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 163, in _optimize_sequential
trial = _run_trial(study, func, catch)
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 264, in _run_trial
raise func_err
File "/home/user/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
value_or_values = func(trial)
File "training/hparam_search.py", line 671, in objective_tf
return objective(trial)
File "training/hparam_search.py", line 660, in objective
val_loss = hps_train(trial)
File "training/hparam_search.py", line 332, in hps_train
iterator, optimizer, dropout_rates
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 317, in get_tower_results
avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(iterator, dropout_rates, reuse=i > 0)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 244, in calculate_mean_edit_distance_and_loss
logits, _ = create_model(batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 195, in create_model
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
File "/home/user/DeepSpeech/training/deepspeech_training/train.py", line 133, in rnn_impl_cudnn_rnn
sequence_lengths=seq_length)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
relative to /usr/local/lib/python3.6/dist-packages/tensorflow_core:
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:440 call
training)
contrib/cudnn_rnn/python/layers/cudnn_rnn.py:518 _forward
seed=self._seed)
contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py:1132 _cudnn_rnn
outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
python/ops/gen_cudnn_rnn_ops.py:2051 cudnn_rnnv3
time_major=time_major, name=name)
python/framework/op_def_library.py:367 _apply_op_helper
g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
python/framework/ops.py:5979 _get_graph_from_inputs
_assert_same_graph(original_graph_element, graph_element)
python/framework/ops.py:5914 _assert_same_graph
(item, original_item))
ValueError: Tensor("cudnn_lstm/opaque_kernel:0", dtype=float32_ref, device=/device:GPU:0) must be from the same graph as Tensor("tower_0/Reshape_2:0", shape=(?, ?, 512), dtype=float32, device=/device:GPU:0).
It looks like the ValueError is complaining that some tensor is not from the same graph as another. But I don't understand how this can be, since I start each run within a new Graph context, so every tensor should be associated with this new graph.
Optuna version is 2.9.1 and Tensorflow version is 1.15.4
I'd be grateful for any insights into where I'm going wrong here, or even if this is the recommended way to use Optuna. Thanks very much!
All latest versions from the very moment of this post.
tensorflow-gpu: 2.6.0
Python: 3.9.7
CUDA: 11.4.2
cuDNN: 8.2.4
As in the code below, when loading a model that was normalized by not passing arguments to Normalization() it throws an exception when that model is loaded by load_model(), however before loading the model I can use it without any apparent issues which makes you think it's all good since Normalization() did NOT complain and took care of the input shape. When loading a model that was normalized by Normalization(input_dim=5) it does NOT thrown any exception since a known shape is specified. That is weird I mean it should warn you that when normalizing it without passing arguments to Normalization() you should expect an exception when loading it.
I'm not sure if it's a bug so I'm posting it here before reporting a bug in the github section, maybe I'm missing to setup something.
Here's my code:
import numpy as np
import tensorflow as tf
def main():
train_data = np.array([[1, 2, 3, 4, 5]])
train_label = np.array([123])
# Uncomment this to load the model and comment the next model and normalizer related lines.
#model = tf.keras.models.load_model('AI/test.h5')
normalizer = tf.keras.layers.experimental.preprocessing.Normalization()
normalizer.adapt(train_data)
model = tf.keras.Sequential([normalizer, tf.keras.layers.Dense(units=1)])
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.1), loss='mean_absolute_error')
model.fit(train_data, train_label, epochs=3000)
model.save('AI/test.h5')
unseen_data = np.array([[1, 2, 3, 4, 6]])
prediction = model.predict(unseen_data)
print(prediction)
if __name__ == "__main__":
main()
It throws the following exception:
Traceback (most recent call last):
File "E:\Backup\Desktop\tensorflow_test.py", line 30, in <module>
main()
File "E:\Backup\Desktop\tensorflow_test.py", line 11, in main
model = tf.keras.models.load_model('AI/test.h5')
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\saving\save.py", line 200, in load_model
return hdf5_format.load_model_from_hdf5(filepath, custom_objects,
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\saving\hdf5_format.py", line 180, in load_model_from_hdf5
model = model_config_lib.model_from_config(model_config,
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\saving\model_config.py", line 52, in model_from_config
return deserialize(config, custom_objects=custom_objects)
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\layers\serialization.py", line 208, in deserialize
return generic_utils.deserialize_keras_object(
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\generic_utils.py", line 674, in deserialize_keras_object
deserialized_obj = cls.from_config(
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\sequential.py", line 434, in from_config
model.add(layer)
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\tensorflow\python\training\tracking\base.py", line 530, in _method_wrapper
result = method(self, *args, **kwargs)
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\sequential.py", line 217, in add
output_tensor = layer(self.outputs[0])
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 976, in __call__
return self._functional_construction_call(inputs, args, kwargs,
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1114, in _functional_construction_call
outputs = self._keras_tensor_symbolic_call(
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 848, in _keras_tensor_symbolic_call
return self._infer_output_signature(inputs, args, kwargs, input_masks)
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 886, in _infer_output_signature
self._maybe_build(inputs)
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 2659, in _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
File "C:\Users\censored\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\layers\preprocessing\normalization.py", line 145, in build
raise ValueError(
ValueError: All `axis` values to be kept must have known shape. Got axis: (-1,), input shape: [None, None], with unknown axis at index: 1
Process finished with exit code 1
It looks like a bug.
Follow this link
if 'input_dim' in kwargs and 'input_shape' not in kwargs:
# Backwards compatibility: alias 'input_dim' to 'input_shape'.
kwargs['input_shape'] = (kwargs['input_dim'],)
if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
# In this case we will later create an input layer
# to insert before the current layer
if 'batch_input_shape' in kwargs:
batch_input_shape = tuple(kwargs['batch_input_shape'])
elif 'input_shape' in kwargs:
if 'batch_size' in kwargs:
batch_size = kwargs['batch_size']
else:
batch_size = None
batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
self._batch_input_shape = batch_input_shape
The error occurs because the normalization could not get any shape information which would lead to self._input_batch_shape =(None, None).
But when loading model(deserialization), It would call build function which should have known shape in all axes.
# Sorted to avoid transposing axes.
self._keep_axis = sorted([d if d >= 0 else d + ndim for d in self.axis])
# All axes to be kept should have known shape.
for d in self._keep_axis:
if input_shape[d] is None:
raise ValueError(
'All `axis` values to be kept must have known shape. Got axis: {}, '
'input shape: {}, with unknown axis at index: {}'.format(
self.axis, input_shape, d))
I am trying to use pre-trained resnet and fine-tune it using triplet loss. The following code I came up with is a combination of tutorials I found on the topic:
import pathlib
import tensorflow as tf
import tensorflow_addons as tfa
with tf.device('/cpu:0'):
INPUT_SHAPE = (32, 32, 3)
BATCH_SIZE = 16
data_dir = pathlib.Path('/home/user/dataset/')
base_model = tf.keras.applications.ResNet50V2(
weights='imagenet',
pooling='avg',
include_top=False,
input_shape=INPUT_SHAPE,
)
# following two lines are added after edit, originally it was model = base_model
head_model = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(base_model.output)
model = tf.keras.Model(inputs=base_model.input, outputs=head_model)
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
rotation_range=10,
zoom_range=0.1,
)
generator = datagen.flow_from_directory(
data_dir,
target_size=INPUT_SHAPE[:2],
batch_size=BATCH_SIZE,
seed=42,
)
model.compile(
optimizer=tf.keras.optimizers.Adam(0.001),
loss=tfa.losses.TripletSemiHardLoss(),
)
model.fit(
generator,
epochs=5,
)
Unfortunately after running the code I get the following error:
Found 4857 images belonging to 83 classes.
Epoch 1/5
Traceback (most recent call last):
File "ReID/external_process.py", line 35, in <module>
model.fit(
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1098, in fit
tmp_logs = train_function(iterator)
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
result = self._call(*args, **kwds)
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 840, in _call
return self._stateless_fn(*args, **kwds)
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 2829, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 1843, in _filtered_call
return self._call_flat(
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 1923, in _call_flat
return self._build_call_outputs(self._inference_function.call(
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 545, in call
outputs = execute.execute(
File "/home/user/videolytics/venv_python/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 59, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Input to reshape is a tensor with 1328 values, but the requested shape has 16
[[{{node TripletSemiHardLoss/PartitionedCall/Reshape}}]] [Op:__inference_train_function_13749]
Function call stack:
train_function
2020-10-23 22:07:09.094736: W tensorflow/core/kernels/data/generator_dataset_op.cc:103] Error occurred when finalizing GeneratorDataset iterator: Failed precondition: Python interpreter state is not initialized. The process may be terminated.
[[{{node PyFunc}}]]
The dataset directory has 83 subdirectories, one per class and each of this subdirectories contains images of given class. The dimension 1328 in the error output is the batch size (16) times number of classes (83), and the dimension 16 is the batch size (both dimensions change accordingly if I change the BATCH_SIZE.
To be honest I do not really understand the error, so any solution or even any kind of indight where is the problem is deeply appreciated.
The problem is that the TripletSemiHardLoss expects
labels y_true to be provided as 1-D integer Tensor with shape [batch_size] of multi-class integer labels
but the flow_from_directory by default generate categorical labels; using class_mode="sparse" should fix the problem.
I'm trying to build a simple model and save the untrained layers. (I'll later want to train it). I'm trying to use tensorflow core API's without relying on Keras layers so I can more directly control what I use and maximize compatibility with TFLite.
import numpy as np
import tensorflow as tf
class BasicModel(tf.Module):
def __init__(self):
self.const = None
#tf.function(input_signature=[
tf.TensorSpec(shape=[None,20],dtype=tf.int32),
])
def rnn(self, captions):
# ENCODER
weights = tf.Variable(tf.random.normal([10000, 724]))#, shape=[vocab_size,embedding_dimension], name="embedding_weights")
embedding_output = tf.nn.embedding_lookup(weights,captions)
#activation is tanh for GRUCell
sequence = tf.unstack(embedding_output,num=20, axis=1)
cell = tf.compat.v1.nn.rnn_cell.GRUCell(20)
print(sequence)
gru_layer = tf.compat.v1.nn.static_rnn(cell, sequence, dtype=tf.float32)
return gru_layer
root = BasicModel()
concrete_function = root.rnn.get_concrete_function()
tf.saved_model.save(root,"model",concrete_function)
I expect to have an untrained model that saves but instead I get an error:
Traceback (most recent call last):
File "model_tensorflow_2.py", line 24, in <module>
concrete_function = root.rnn.get_concrete_function()#tf.constant(images), tf.constant(captions), tf.constant(cap_lens))
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 782, in get_concrete_function
return self._stateless_fn.get_concrete_function(*args, **kwargs)
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 1891, in get_concrete_function
graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2150, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2041, in _create_graph_function
capture_by_value=self._capture_by_value),
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 915, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 358, in wrapped_fn
return weak_wrapped_fn().__wrapped__(*args, **kwds)
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2658, in bound_method_wrapper
return wrapped_fn(*args, **kwargs)
File "/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 905, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
model_tensorflow_2.py:13 rnn *
weights = tf.Variable(tf.random.normal([10000, 724]))#, shape=[vocab_size,embedding_dimension], name="embedding_weights")
/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py:260 __call__
return cls._variable_v2_call(*args, **kwargs)
/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py:254 _variable_v2_call
shape=shape)
/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py:65 getter
return captured_getter(captured_previous, **kwargs)
/Users/t.capes/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py:413 invalid_creator_scope
"tf.function-decorated function tried to create "
ValueError: tf.function-decorated function tried to create variables on non-first call.
tf.function does not allow to create variables on non-first call, because the semantics of that are not clear: should the variables be re-created on each call? should they be implicitly cached? (see this bit in the "tf.function and AutoGraph" talk from TF Summit 2019).
A common workaround is to have a helper function creating the variables and ensure that it's called at most once for each instance:
class BasicModel(tf.Module):
# ...
def _create_parameters(self, ...):
self._weights = tf.Variable(...)
self._parameters_created = True
def rnn(self, ...):
if not self._parameters_created:
self._create_parameters(...)
...