I need to set a breakpoint in an old Keras model:
import tensorflow as tf
inputs = tf.keras.Input(shape=(3,))
x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs)
x1 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)
outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x1)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
The actual model is a lot more complicated; I am just providing a snippet. Is there a way for me to set a breakpoint in the forward pass? I am just trying to see the intermediate model output.

It might depend a bit on your actual setting, but you could split your model by its layers, similar to how you would set up an autoencoder.
Then do a forward pass through the backbone, inspect the result, pass it through the head, and get the output.
import tensorflow as tf
# Build the functional model from the question: 3 input features -> Dense(4) -> Dense(5) -> Dense(5).
inputs = tf.keras.Input(shape=(3,))
x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs)
x1 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)
outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x1)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
# Wrap the first two entries of model.layers ("back") and the remaining entries ("head")
# in two Sequential models, so the forward pass can be run in two steps.
back = tf.keras.Sequential(model.layers[:2])
head = tf.keras.Sequential(model.layers[2:])
# Instead of doing model(input) you can now do
# NOTE(review): `input` is never assigned in this snippet, so as written it refers to the
# Python builtin; presumably it stands for your batch of data -- substitute your own tensor.
inter = back(input)
# `inter` is the intermediate output; inspect it (or set a breakpoint) before running the head.
result = head(inter)
Alternatively, you could define multiple outputs. This is a bit uglier to train, but for testing purposes you can copy the trained weights into this cloned model:
inputs = tf.keras.Input(shape=(3,))
x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs)
x1 = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)
outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x1)
model = tf.keras.Model(inputs=inputs, outputs=[outputs, x1]) #<-- adding your intermediate layer as a second output


" ValueError: Expecting KerasTensor which is from tf.keras.Input()". Error in prediction with dropout function

I am trying to predict uncertainty in a regression problem using Dropout during testing as per Yarin Gal's article. I created a class using Keras's backend function as provided by this stack overflow question's answer. The class takes a NN model as input and randomly drops neurons during testing to give a stochastic estimate rather than deterministic output for a time-series forecasting.
I create a simple encoder-decoder model as shown below for the forecasting with 0.1 dropout during training:
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec) #maybe use dropout=0.1
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)
# optimiser = optimizers.Adam(clipnorm=1)
enc_dec_model = Model(input_sequence, output)
After that, I define and call the DropoutPrediction class.
# Define the class:
class KerasDropoutPrediction(object):
def __init__(self ,model):
self.f = K.function(
def predict(self ,x, n_iter=10):
result = []
for _ in range(n_iter):
result.append(self.f([x , 1]))
result = np.array(result).reshape(n_iter ,x.shape[0] ,x.shape[1]).T
return result
# Call the object:
kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(x_test,n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=1)
However, in the line
kdp = KerasDropoutPrediction(enc_dec_model), when I call the LSTM model,
I got the following error message which says the input has to be a Keras Tensor. Can anyone help me with this error?
Error Message:
ValueError: Found unexpected instance while processing input tensors for keras functional model. Expecting KerasTensor which is from tf.keras.Input() or output from keras layer call(). Got: 0
To activate Dropout at inference time, you simply have to specify training=True (TF>2.0) in the layer of interest (in the last LSTM layer in your case)
with training=False
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=False)
m = Model(inp,x)
# m.compile(...)
X = np.random.uniform(0,1, (1,10,1))
output = []
for i in range(0,100):
output.append(m.predict(X)) # always the same
with training=True
inp = Input(shape=(10, 1))
x = LSTM(1, dropout=0.3)(inp, training=True)
m = Model(inp,x)
# m.compile(...)
X = np.random.uniform(0,1, (1,10,1))
output = []
for i in range(0,100):
output.append(m.predict(X)) # always different
In your example, this becomes:
# Encoder-decoder forecasting model. The decoder LSTM is called with training=True so that
# its dropout (rate 0.1) stays active at inference time as well as during training.
input_sequence = Input(shape=(lookback, train_x.shape[2]))
encoder = LSTM(128, return_sequences=False)(input_sequence)
r_vec = RepeatVector(forward_pred)(encoder)
decoder = LSTM(128, return_sequences=True, dropout=0.1)(r_vec, training=True)
output = TimeDistributed(Dense(train_y.shape[2], activation='linear'))(decoder)
enc_dec_model = Model(input_sequence, output)
# NOTE(review): the compile/fit lines were garbled in this copy; only the tail of the fit call
# (", train_y, epochs=10, batch_size=32)") survived. The first fit argument is presumably
# train_x, and the compile settings (loss/optimizer) are unknown -- fill them in before running.
# enc_dec_model.compile(...)
enc_dec_model.fit(train_x, train_y, epochs=10, batch_size=32)
and the KerasDropoutPrediction:
class KerasDropoutPrediction(object):
    """Collect repeated predictions from a model whose forward pass is stochastic.

    The wrapped object only needs a ``predict(X)`` method. When the model keeps
    dropout active at inference time, each call yields a different sample.
    """

    def __init__(self, model):
        """Store the model whose ``predict`` method will be called repeatedly."""
        self.model = model

    def predict(self, X, n_iter=10):
        """Run ``n_iter`` forward passes on ``X`` and stack the results.

        Args:
            X: Input passed unchanged to ``self.model.predict`` on every iteration.
            n_iter: Number of forward passes to run.

        Returns:
            ``np.ndarray`` whose first axis indexes the iteration, i.e. the
            per-call predictions stacked along axis 0.
        """
        # One model call per iteration; without this call the result would be empty.
        result = [self.model.predict(X) for _ in range(n_iter)]
        return np.array(result)
kdp = KerasDropoutPrediction(enc_dec_model)
y_pred_do = kdp.predict(test_x, n_iter=100)
y_pred_do_mean = y_pred_do.mean(axis=0)

Completely different results using Tensorflow and Pytorch for MobilenetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
def _init_weights(m):
if type(m) == nn.Linear:
def get_mob_v3_small():
model = torchvision.models.mobilenet_v3_small(pretrained=True)
children_list = get_children(model)
for c in children_list[:-50]:
for p in c.parameters():
p.requires_grad = False
return model
class TransferMobileNetV3_v2(nn.Module):
def __init__(self,
num_keypoints: int = 5):
super(TransferMobileNetV3_v2, self).__init__()
self.classifier_neurons = num_keypoints*2
self.base_model = get_mob_v3_small()
self.base_model.classifier = nn.Sequential(
nn.Linear(in_features=1024, out_features=1024),
nn.Linear(in_features=1024, out_features=512),
nn.Linear(in_features=512, out_features=self.classifier_neurons)
def forward(self, x):
out = self.base_model(x)
return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
len_dataloader = len(trainloader)
for epoch in range(1, args.epochs+1):
for batch_idx, sample in enumerate(trainloader):
inputs, labels = sample
inputs, labels =,
with torch.cuda.amp.autocast(args.use_amp):
prediction = net(inputs)
loss = train_loss_fn(prediction, labels)
def main():
args = make_args_parser()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = args.seed
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
loss_fn = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=3e-2,
betas=(0.9, 0.999))
scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)
for layer in model.layers[:-50]:
Training Script
model.compile(loss = "mse",
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2))
history =, output_numpy,
batch_size=32, epochs=100,validation_split = 0.2)
The PyTorch model predicts one single point around the center for all 5 different points.
The Tensorflow model predicts the points quite well and is quite accurate.
The loss in the Pytorch model is much higher than the Tensorflow model.
Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small model seems to be different in PyTorch and in Tensorflow. I do not know if I am interpreting it wrong, but I'm putting it here just in case.

How to merge ReLU after quantization aware training

I have a network which contains Conv2D layers followed by ReLU activations, declared as such:
x = layers.Conv2D(self.hparams['channels_count'], kernel_size=(4,1))(x)
x = layers.ReLU()(x)
And it is ported to TFLite with the following representation:
Basic TFLite network without Q-aware training
However, after performing quantization-aware training on the network and porting it again, the ReLU layers are now explicit in the graph:
TFLite network after Q-aware training
This results in them being processed separately on the target instead of during the evaluation of the Conv2D kernel, inducing a 10% performance loss in my overall network.
Declaring the activation with the following implicit syntax does not produce the problem:
x = layers.Conv2D(self.hparams['channels_count'], kernel_size=(4,1), activation='relu')(x)
Basic TFLite network with implicit ReLU activation
TFLite network with implicit ReLU after Q-aware training
However, this restricts the network to basic ReLU activation, whereas I would like to use ReLU6 which cannot be declared in this way.
Is this a TFLite issue? If not, is there a way to prevent the ReLU layer from being split? Or alternatively, is there a way to manually merge the ReLU layers back into the Conv2D layers after the quantization-aware training?
QA training code:
def learn_qaware(self):
quantize_model = tfmot.quantization.keras.quantize_model
self.model = quantize_model(self.model)
training_generator = SCDataGenerator(self.training_set)
validate_generator = SCDataGenerator(self.validate_set)
epochs = self.hparams['max_epochs'],
batch_size = 1,
shuffle = self.hparams['shuffle_curves'],
validation_data = validate_generator,
callbacks = self.get_callbacks(qa_learn=True),
Quantized TFLite model generation code:
def tflite_convert(classifier):
output_file = get_tflite_filename(classifier.model_path)
# Convert the model to the TensorFlow Lite format without quantization
saved_shape = classifier.model.input.shape.as_list()
fixed_shape = saved_shape
fixed_shape[0] = 1
classifier.model.input.set_shape(fixed_shape) # Force batch size to 1 for generation
converter = tf.lite.TFLiteConverter.from_keras_model(classifier.model)
# Set the optimization flag.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Enforce integer only quantization
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
# Provide a representative dataset to ensure we quantize correctly.
if config['eager_mode']:
def representative_dataset():
for x in classifier.validate_set.get_all_inputs():
rs = x.reshape(1, x.shape[0], 1, 1).astype(np.float32)
converter.representative_dataset = representative_dataset
model_tflite = converter.convert()
# Save the model to disk
open(output_file, "wb").write(model_tflite)
return TFLite_model(output_file)
I have found a workaround which works by instantiating a non-trained version of the model, then copying over the weights from the quantization aware trained model before converting to TFLite.
This seems like quite a hack, so I'm still on the lookout for a cleaner solution.
Code for the workaround:
def dequantize(self):
if not hasattr(self, 'fp_model') or not self.fp_model:
self.fp_model = self.get_default_model()
def find_layer_in_model(name, model):
for layer in model.layers:
if == name:
return layer
return None
def find_weight_group_in_layer(name, layer):
for weight_group in quant_layer.trainable_weights:
if == name:
return weight_group
return None
for layer in self.fp_model.layers:
if 'input' in or 'quantize_layer' in
QUANT_TAG = "quant_"
quant_layer = find_layer_in_model(,self.model)
if quant_layer is None:
raise RuntimeError('Failed to match layer ' +
for i, weight_group in enumerate(layer.trainable_weights):
quant_weight_group = find_weight_group_in_layer(, quant_layer)
if quant_weight_group is None:
quant_weight_group = find_weight_group_in_layer(, quant_layer)
if quant_weight_group is None:
raise RuntimeError('Failed to match weight group ' +
self.model = self.fp_model
You can pass activation=tf.nn.relu6 to use ReLU6 activation.

How to apply Triplet Loss for a ResNet50 based Siamese Network in Keras or Tf 2

I have a ResNet-based Siamese network that tries to minimize the L2 distance between two images and then applies a sigmoid, so that it gives a {0:'same',1:'different'} output; the gradients then flow back through the network based on how far off the prediction is. The problem is that the gradient updates are too small, because we are only changing the distance within {0,1}. So I thought of using the same architecture, but based on Triplet Loss.
I1 = Input(shape=image_shape)
I2 = Input(shape=image_shape)
res_m_1 = ResNet50(include_top=False, weights='imagenet', input_tensor=I1, pooling='avg')
res_m_2 = ResNet50(include_top=False, weights='imagenet', input_tensor=I2, pooling='avg')
x1 = res_m_1.output
x2 = res_m_2.output
# x = Flatten()(x) or use this one if not using any pooling layer
distance = Lambda( lambda tensors : K.abs( tensors[0] - tensors[1] )) ([x1,x2] )
final_output = Dense(1,activation='sigmoid')(distance)
siamese_model = Model(inputs=[I1,I2], outputs=final_output)
So how can I change it to use the Triplet Loss function? What adjustments should be done here in order to get this done? One change will be that I'll have to calculate
res_m_3 = ResNet50(include_top=False, weights='imagenet', input_tensor=I2, pooling='avg')
x3 = res_m_3.output
One thing found in tf docs is triplet-semi-hard-loss and is given as:
As shown in the paper, the best results are from triplets known as "Semi-Hard". These are defined as triplets where the negative is farther from the anchor than the positive, but still produces a positive loss. To efficiently find these triplets we utilize online learning and only train from the Semi-Hard examples in each batch.
Another implementation of Triplet Loss which I found on Kaggle is: Triplet Loss Keras
Which one should I use and most importantly, HOW?
P.S: People also use something like: x = Lambda(lambda x: K.l2_normalize(x,axis=1))(x) after model.output. Why is that? What is this doing?
Following this answer of mine, and with role of TripletSemiHardLoss in mind, we could do following:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
from tensorflow.keras import models, layers
def _normalize_img(img, label):
img = tf.cast(img, tf.float32) / 255.
return (img, label)
train_dataset, test_dataset = tfds.load(name="mnist", split=['train', 'test'], as_supervised=True)
# Build your input pipelines
train_dataset = train_dataset.shuffle(1024).batch(BATCH_SIZE)
train_dataset =
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset =
inputs = layers.Input(shape=(28, 28, 1))
resNet50 = tf.keras.applications.ResNet50(include_top=False, weights=None, input_tensor=inputs, pooling='avg')
outputs = layers.Dense(LATENT_DEM, activation=None)(resNet50.output) # No activation on final dense layer
outputs = layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(outputs) # L2 normalize embedding
siamese_model = models.Model(inputs=inputs, outputs=outputs)
# Compile the model
# Train the network
history =

How to use TimeDistributed layer for predicting sequences of dynamic length? PYTHON 3

So I am trying to build an LSTM based autoencoder, which I want to use for time series data. The data are split up into sequences of different lengths. Input to the model thus has shape [None, None, n_features], where the first None stands for the number of samples and the second for the time_steps of the sequence. The sequences are processed by an LSTM with argument return_sequences = False, the coded dimension is then recreated by the function RepeatVector and run through an LSTM again. In the end I would like to use the TimeDistributed layer, but how do I tell Python that the time_steps dimension is dynamic? See my code:
from keras import backend as K
.... other dependencies .....
input_ae = Input(shape=(None, 2)) # shape: time_steps, n_features
LSTM1 = LSTM(units=128, return_sequences=False)(input_ae)
code = RepeatVector(n=K.shape(input_ae)[1])(LSTM1) # bottleneck layer
LSTM2 = LSTM(units=128, return_sequences=True)(code)
output = TimeDistributed(Dense(units=2))(LSTM2) # ??????? HOW TO ????
# no problem here so far:
model = Model(input_ae, outputs=output)
model.compile(optimizer='adam', loss='mse')
this function seems to do the trick
def repeat(x_inp):
    """Tile a tensor along a new axis 1, once per step of a reference tensor.

    Args:
        x_inp: A pair ``(x, inp)``. ``x`` is the tensor to repeat; ``inp`` is
            the reference tensor whose dynamic size along axis 1 gives the
            repeat count.

    Returns:
        ``x`` with a new axis inserted at position 1 and repeated
        ``tf.shape(inp)[1]`` times along that axis.
    """
    encoded, reference = x_inp
    # Read the length at run time so it works when the static dimension is None.
    n_steps = tf.shape(reference)[1]
    expanded = tf.expand_dims(encoded, 1)
    return tf.repeat(expanded, [n_steps], axis=1)
# LSTM autoencoder over sequences with an unspecified number of time steps (shape=(None, 2)).
input_ae = Input(shape=(None, 2))
LSTM1 = LSTM(units=128, return_sequences=False)(input_ae)
# The Lambda layer calls repeat() with [LSTM1, input_ae], so the encoded vector is tiled
# using the input's run-time length instead of a fixed RepeatVector count.
code = Lambda(repeat)([LSTM1, input_ae])
LSTM2 = LSTM(units=128, return_sequences=True)(code)
output = TimeDistributed(Dense(units=2))(LSTM2)
model = Model(input_ae, output)
model.compile(optimizer='adam', loss='mse')
# Random data with shape (100, 30, 2).
X = np.random.uniform(0,1, (100,30,2))
# NOTE(review): the fit call was fused into the previous line in this copy (", X, epochs=5)");
# restored here as an autoencoder fit with the input used as its own target.
model.fit(X, X, epochs=5)
I'm using tf.keras with TF 2.2