How to use Keras Multiply() with tf.Variable? - tensorflow

How do I multiply tf.keras.layers with tf.Variable?
Context: I am creating a sample-dependent convolutional filter, which consists of a generic filter W that is transformed through sample-dependent shifting + scaling. Therefore, the original convolutional filter W is transformed into aW + b, where a is the sample-dependent scaling and b is the sample-dependent shifting. One application of this is training an autoencoder where the sample dependency is the label, so each label shifts/scales the convolutional filter. Because of the sample/label-dependent convolutions, I am using tf.nn.conv2d, which takes the actual filters as input (as opposed to just the number/size of filters), and a Lambda layer with tf.map_fn to apply a different "transformed filter" (based on the label) for each sample. Although the details are different, this kind of sample-dependent convolution approach is discussed in this post: Tensorflow: Convolutions with different filter for each sample in the mini-batch.
Here is what I am thinking:
input_img = keras.Input(shape=(28, 28, 1))
label = keras.Input(shape=(10,)) # number of classes
num_filters = 32
shift = layers.Dense(num_filters, activation=None, name='shift')(label) # (32,)
scale = layers.Dense(num_filters, activation=None, name='scale')(label) # (32,)
# filter is of shape (filter_h, filter_w, input channels, output filters)
filter = tf.Variable(tf.ones((3,3,input_img.shape[-1],num_filters)))
# TODO: need to scale and shift -> scale*filter + shift along each output filter dimension (32 filter dimensions)
I am not sure how to implement the TODO part. I was thinking of tf.keras.layers.Multiply() for scaling and tf.keras.layers.Add() for shifting, but to my knowledge they do not work with a tf.Variable. How do I get around this? Assuming the shapes broadcast correctly, I would like to do something like this (note: the output should still be the same shape as var and is just scaled along each of the 32 output filter dimensions):
output = tf.keras.layers.Multiply()([var, scale])

It requires some work and needs a custom layer. For example, you cannot use a tf.Variable inside a tf.keras Lambda layer.
class ConvNorm(layers.Layer):
    def __init__(self, height, width, n_filters):
        super(ConvNorm, self).__init__()
        self.height = height
        self.width = width
        self.n_filters = n_filters

    def build(self, input_shape):
        # Generic filter W of shape (filter_h, filter_w, in_channels, out_filters)
        self.filter = self.add_weight(shape=(self.height, self.width, input_shape[-1], self.n_filters),
                                      initializer='glorot_uniform',
                                      trainable=True)
        # TODO: Add bias too

    def call(self, x, scale, shift):
        # Reshape (batch, n_filters) -> (batch, 1, 1, n_filters) so the vectors
        # broadcast against the filter's output-filter dimension
        shift_reshaped = tf.expand_dims(tf.expand_dims(shift, 1), 1)
        scale_reshaped = tf.expand_dims(tf.expand_dims(scale, 1), 1)
        norm_conv_out = tf.nn.conv2d(x, self.filter * scale_reshaped + shift_reshaped,
                                     strides=(1, 1, 1, 1), padding='SAME')
        return norm_conv_out
Using the layer
import tensorflow as tf
import tensorflow.keras.layers as layers
input_img = layers.Input(shape=(28, 28, 1))
label = layers.Input(shape=(10,)) # number of classes
num_filters = 32
shift = layers.Dense(num_filters, activation=None, name='shift')(label) # (32,)
scale = layers.Dense(num_filters, activation=None, name='scale')(label) # (32,)
conv_norm_out = ConvNorm(3,3,32)(input_img, scale, shift)
print(conv_norm_out.shape)
Note: I haven't added a bias term. You will need a bias for the convolution layer as well, but that's straightforward.
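If you need the filter to be genuinely different for each sample in a batch (the tf.map_fn route mentioned in the question), a rough sketch could look like the following. The per_sample_conv helper and the shapes are assumptions for illustration, not part of the original code:

import tensorflow as tf

def per_sample_conv(images, scale, shift, base_filter):
    # images: (B, H, W, Cin); scale, shift: (B, n_filters)
    # base_filter: (kh, kw, Cin, n_filters), shared across the batch
    filters = (base_filter[None, ...] * scale[:, None, None, None, :]
               + shift[:, None, None, None, :])          # (B, kh, kw, Cin, n_filters)

    def conv_one(args):
        img, kern = args                                  # (H, W, Cin), (kh, kw, Cin, n_filters)
        out = tf.nn.conv2d(img[None, ...], kern, strides=(1, 1, 1, 1), padding='SAME')
        return tf.squeeze(out, 0)                         # (H, W, n_filters)

    # fn_output_signature needs a recent TF2; older versions use dtype=tf.float32
    return tf.map_fn(conv_one, (images, filters), fn_output_signature=tf.float32)

You could then call something like this from a custom layer's call() (similar to ConvNorm above) instead of scaling the shared filter directly.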

Related

GradCAM (Class Activation Map) - tape.gradient() returns None for finetuned model

I have fine-tuned a model and saved all of its weights under the name checkpoint_vgg16.h5. I want to apply Grad-CAM to my fine-tuned model (not the pretrained VGG model); if Grad-CAM requires anything special during the training phase, I did not use it.
The implementation details are as follows:
base_model = keras.applications.VGG16(include_top=False, input_shape=(128, 88, 3))
model1 = Sequential()
for layer in base_model.layers:
    model1.add(layer)
model1.add(Flatten())
model1.add(Dense(4096, activation="relu"))
model1.add(Dense(124, activation="softmax"))
model1.load_weights("checkpoint_vgg16.h5")
model = Model(model1.input, model1.layers[-1].output)
GradCAM class:
class GradCAM:
    def __init__(self, model, classIdx, layerName=None):
        # store the model, the class index used to measure the class
        # activation map, and the layer to be used when visualizing
        # the class activation map
        self.model = model
        self.classIdx = classIdx
        self.layerName = layerName
        # if the layer name is None, attempt to automatically find
        # the target output layer
        if self.layerName is None:
            self.layerName = self.find_target_layer()

    def find_target_layer(self):
        # attempt to find the final convolutional layer in the network
        # by looping over the layers of the network in reverse order
        for layer in reversed(self.model.layers):
            # check to see if the layer has a 4D output
            if len(layer.output_shape) == 4:
                return layer.name
        # otherwise, we could not find a 4D layer so the GradCAM
        # algorithm cannot be applied
        raise ValueError("Could not find 4D layer. Cannot apply GradCAM.")

    def compute_heatmap(self, image, eps=1e-8):
        # construct our gradient model by supplying (1) the inputs
        # to our pre-trained model, (2) the output of the (presumably)
        # final 4D layer in the network, and (3) the output of the
        # softmax activations from the model
        gradModel = Model(
            inputs=[self.model.inputs],
            outputs=[self.model.get_layer(self.layerName).output, self.model.output])
        # record operations for automatic differentiation
        with tf.GradientTape() as tape:
            # cast the image tensor to a float-32 data type, pass the
            # image through the gradient model, and grab the loss
            # associated with the specific class index
            inputs = tf.cast(image, tf.float32)
            (convOutputs, predictions) = gradModel(inputs)
            loss = predictions[:, tf.argmax(predictions[0])]
            print(loss)
        # use automatic differentiation to compute the gradients
        grads = tape.gradient(loss, convOutputs)
        print(grads)
        # compute the guided gradients
        castConvOutputs = tf.cast(convOutputs > 0, "float32")
        castGrads = tf.cast(grads > 0, "float32")
        guidedGrads = castConvOutputs * castGrads * grads
        # the convolution and guided gradients have a batch dimension
        # (which we don't need) so let's grab the volume itself and
        # discard the batch
        convOutputs = convOutputs[0]
        guidedGrads = guidedGrads[0]
        # compute the average of the gradient values, and using them
        # as weights, compute the ponderation of the filters with
        # respect to the weights
        weights = tf.reduce_mean(guidedGrads, axis=(0, 1))
        cam = tf.reduce_sum(tf.multiply(weights, convOutputs), axis=-1)
        # grab the spatial dimensions of the input image and resize
        # the output class activation map to match the input image
        # dimensions
        (w, h) = (image.shape[2], image.shape[1])
        heatmap = cv2.resize(cam.numpy(), (w, h))
        # normalize the heatmap such that all values lie in the range
        # [0, 1], scale the resulting values to the range [0, 255],
        # and then convert to an unsigned 8-bit integer
        numer = heatmap - np.min(heatmap)
        denom = (heatmap.max() - heatmap.min()) + eps
        heatmap = numer / denom
        heatmap = (heatmap * 255).astype("uint8")
        # return the resulting heatmap to the calling function
        return heatmap

    def overlay_heatmap(self, heatmap, image, alpha=0.5,
                        colormap=cv2.COLORMAP_VIRIDIS):
        # apply the supplied color map to the heatmap and then
        # overlay the heatmap on the input image
        heatmap = cv2.applyColorMap(heatmap, colormap)
        output = cv2.addWeighted(image, alpha, heatmap, 1 - alpha, 0)
        # return a 2-tuple of the color mapped heatmap and the output,
        # overlaid image
        return (heatmap, output)
For the prediction part:
image = cv2.imread('sample.jpg')
image = image.astype('float32') / 255
image = np.expand_dims(image, axis=0)
preds = model.predict(image)
i = np.argmax(preds[0])
icam = GradCAM(model, i, 'block5_conv3') #block5_conv3: the last conv block in my model
print(icam)
heatmap = icam.compute_heatmap(image)
heatmap = cv2.resize(heatmap, (128, 88))
image = cv2.imread('sample.jpg')
(heatmap, output) = icam.overlay_heatmap(heatmap, image, alpha=0.5)
fig, ax = plt.subplots(1, 3)
ax[0].imshow(heatmap)
ax[1].imshow(image)
ax[2].imshow(output)
However, I get following error:
castGrads = tf.cast(grads > 0, "float32")
TypeError: '>' not supported between instances of 'NoneType' and 'int'
The line grads = tape.gradient(loss, convOutputs) in the compute_heatmap method of GradCAM returns None.
How can I fix this error? I want to apply GradCAM on my model.
Ref. Grad-CAM class activation visualization
Also, this question is similar to mine, but its solution didn't work for me.

Neural Network isn't learning anything meaningful using Triplet Loss

I am working on a triplet loss based model for this Kaggle competition.
Short description: in this competition, we have been challenged to build an algorithm to identify individual whales in images by analyzing a database containing more than 25,000 images gathered from research institutions and public contributors.
https://www.kaggle.com/c/humpback-whale-identification?rvi=1
I have decided to use a Siamese network architecture and train it to give me encodings which I can then use to calculate the distance between two pictures of whales. If this distance is below a particular threshold, the two pictures belong to the same whale; if it is greater, they aren't the same whale.
This is the triplet loss function I used (learnt from Andrew Ng's deep learning specialization), but I also normalized the encodings to make the loss more interpretable (easier to determine the margin and split point) across different models, if that makes sense. (I first tried it without the normalization, and when that didn't work I tried normalizing.) I have also tried changing alpha (the margin), varying it from 0.2 to 0.6.
from tensorflow.nn import l2_normalize as norm_l2
def triplet_loss(y_true, y_pred, alpha = 0.3):
    """
    Arguments:
    y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    y_pred -- python list containing three objects:
        anchor -- the encodings for the anchor images, of shape (None, 128)
        positive -- the encodings for the positive images, of shape (None, 128)
        negative -- the encodings for the negative images, of shape (None, 128)
    Returns:
    loss -- real number, value of the loss
    """
    anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]
    anchor, positive, negative = norm_l2(anchor), norm_l2(positive), norm_l2(negative)
    # Step 1: Compute the (encoding) distance between the anchor and the positive
    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), axis=-1)
    # Step 2: Compute the (encoding) distance between the anchor and the negative
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), axis=-1)
    # Step 3: subtract the two previous distances and add alpha.
    basic_loss = tf.add(tf.subtract(pos_dist, neg_dist), alpha)
    # Step 4: Take the maximum of basic_loss and 0.0. Sum over the training examples.
    loss = tf.reduce_sum(tf.maximum(basic_loss, 0.0))
    return loss
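As a quick sanity check of the loss on its own (this snippet is purely illustrative and not part of my training code; it assumes eager TF2 with tensorflow imported as tf), positives that are nearly identical to the anchors should give a lower loss than random positives:

a = tf.random.normal((8, 128))
p_close = a + 0.01 * tf.random.normal((8, 128))   # positives near the anchors
p_far = tf.random.normal((8, 128))                # unrelated "positives"
n = tf.random.normal((8, 128))
print(triplet_loss(None, [a, p_close, n]).numpy())
print(triplet_loss(None, [a, p_far, n]).numpy())  # expected to be larger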
This is an example of one of the model architectures I tried out. So far I have tried pretrained FaceNet, ResNet, DenseNet and Xception, and I have tried freezing different numbers of layers in each.
R = tf.keras.applications.ResNet50(include_top=False, weights = 'imagenet', input_shape=(224,224,3))
lr = 0.0001
optimizer = Adam(learning_rate=lr)
R.compile(optimizer=optimizer, loss = triplet_loss)
for layer in R.layers[0:30]:
    layer.trainable = False

em_Rmodel = Sequential([
    R,
    GlobalAveragePooling2D(),
    #tf.keras.layers.GlobalMaxPooling2D(),
    Dense(512, activation='relu'),
    bn(),
    Dense(256, activation='sigmoid'),
    Dense(128, activation='sigmoid')
])
def make_tripletModel(model):
    # I was manually changing the input shape to fit the default shape of pretrained networks
    A = Input(shape=(224, 224, 3), name='anchor')
    P = Input(shape=(224, 224, 3), name='anchorPositive')
    N = Input(shape=(224, 224, 3), name='anchorNegative')
    enc_A = model(A)
    enc_P = model(P)
    enc_N = model(N)
    tripletModel = Model(inputs=[A, P, N], outputs=[enc_A, enc_P, enc_N])
    return tripletModel
tripletModel = make_tripletModel(em_Rmodel)
I have been training using semi-hard triplets and have also been augmenting data properly to generate more training images.
This is the batch generator I used for training. crop_batch is a function that crops images to show only the whale's tail, which is what is used to identify a whale. It uses a DenseNet trained on more than 1,000 images of whale tails with their surrounding bounding boxes, and it does the job sufficiently well.
def batch_generator_RN(batch_size = batch_size, ishape = (256, 256, 3), model_input_shape = (224, 224, 3)):
    triplet_generator = get_triplets()
    y_val = np.zeros((batch_size, 2, 1))
    anchors = np.zeros((batch_size, ishape[0], ishape[1], ishape[2]))
    positives = np.zeros((batch_size, ishape[0], ishape[1], ishape[2]))
    negatives = np.zeros((batch_size, ishape[0], ishape[1], ishape[2]))
    while True:
        for i in range(batch_size):
            anchors[i], positives[i], negatives[i] = next(triplet_generator)
        anc = crop_batch(anchors, batch_size=batch_size, img_shape=model_input_shape)
        pos = crop_batch(positives, batch_size=batch_size, img_shape=model_input_shape)
        neg = crop_batch(negatives, batch_size=batch_size, img_shape=model_input_shape)
        x_data = {'anchor': anc,
                  'anchorPositive': pos,
                  'anchorNegative': neg
                  }
        yield (x_data, [y_val, y_val, y_val])
And finally, this is, in general, how I have been trying to train these models. I have tried both reducing and increasing the learning rate, with batch_size = 16.
lr = 0.0001
optimizer = Adam(learning_rate=lr)
tripletModel.compile(optimizer = optimizer, loss = triplet_loss)
es = EarlyStopping(monitor='loss', patience=20, min_delta=0.05, restore_best_weights=True)
#mc = ModelCheckpoint('Rmodel.h5', monitor='loss', save_best_only=True, save_weights_only=True)
rlr = ReduceLROnPlateau(monitor='loss',min_delta=0.05,factor = 0.1,patience = 5, verbose = 1, min_lr = 0)
gen = batch_generator(batch_size)
tripletModel.fit(gen, steps_per_epoch=64, epochs = 40, callbacks=[es, rlr])
So after training all these models: in some of them the triplet loss does go down for a while but then plateaus, and the model learns nothing meaningful (which basically means that just by looking at the distance between two embeddings I can't figure out whether they are the same whale or not). In other models, immediately after the first or second epoch the weights converge, don't change at all, and the model doesn't learn anything.
I have tried a very wide range of learning rates and I am pretty sure that isn't the problem.
Please tell me if I should add all the code files so you can understand the problem better. The reason I haven't done so yet is that I haven't cleaned the code up, but I will gladly do so if required. Thanks.
When you say that it doesn't learn anything, is it that the loss reaches a plateau and thus stops decreasing, or does it decrease significantly but, when you predict, the embeddings of both same and different whales are similar in value?
The triplet_loss() and batch_generator_RN() functions are correct; the problem is not related to the data generation.
However, I suspect that your learning rate is too high while you also freeze a lot of layers, i.e. a large number of parameters are not trainable, which may leave your network unable to converge.
My suggestion is to unfreeze all the layers and decrease the learning rate to 0.00001 and start training again, regardless of the architecture that you use (Xception/ResNet etc.)
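A minimal sketch of that suggestion, reusing the names from the question's code (R, triplet_loss, tripletModel, gen, es and rlr are assumed to be defined as above):

# Unfreeze the whole backbone and retrain with the lower learning rate suggested above.
for layer in R.layers:
    layer.trainable = True

optimizer = Adam(learning_rate=0.00001)
tripletModel.compile(optimizer=optimizer, loss=triplet_loss)
tripletModel.fit(gen, steps_per_epoch=64, epochs=40, callbacks=[es, rlr])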

How to compute saliency map using keras backend

I am trying to construct a basic "vanilla gradient" saliency heatmap (gradient-based feature attribution) for MNIST using keras. I know there are libraries such as this one to compute saliency heatmaps, but I would like to construct this from scratch since the vanilla gradient approach seems conceptually straightforward to implement. I have trained the following digit classifier in Keras using functional model definition:
input = layers.Input(shape=(28,28,1), name='input')
conv2d_1 = layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(input)
maxpooling2d_1 = layers.MaxPooling2D(pool_size=(2, 2), name='maxpooling2d_1')(conv2d_1)
conv2d_2 = layers.Conv2D(64, kernel_size=(3, 3), activation='relu')(maxpooling2d_1)
maxpooling2d_2 = layers.MaxPooling2D(pool_size=(2, 2))(conv2d_2)
flatten = layers.Flatten(name='flatten')(maxpooling2d_2)
dropout = layers.Dropout(0.5, name='dropout')(flatten)
dense = layers.Dense(num_classes, activation='softmax', name='dense')(dropout)
model = keras.models.Model(inputs=input, outputs=dense)
Now, I want to compute the saliency map for a single MNIST image. Since the final layer has a softmax activation and the denominator is a normalization term (so that the output nodes add up to 1), I believe that I need to either take the pre-softmax output or change the activation of the trained model to linear when computing saliency maps. I will do the latter.
model.layers[-1].activation = tf.keras.activations.linear # swap activation to linear
input = loaded_model.layers[0].input
output = loaded_model.layers[-1].output
input_image = x_test[0] # shape is (28, 28, 1)
pred = np.argmax(loaded_model.predict(np.expand_dims(input_image, axis=0))) # predicted class
However, I am not sure what to do beyond this. I know I can use K.gradients(output, input) to compute gradients. That being said, I believe I should compute the gradient of the predicted class with respect to the input image, rather than the gradient of the entire output. How would I do this? Also, I'm not sure how to evaluate the saliency heatmap for a specific image/prediction. I imagine I will have to use sess = tf.keras.backend.get_session() and sess.run(), but I'm not sure exactly how. I would greatly appreciate any help with completing the saliency heatmap code. Thanks!
If you add the activation as a single layer after the last dense layer with:
keras.layers.Activation('softmax')
you can do:
linear_model = keras.Model(inputs=model.input, outputs=model.layers[-2].output)
Then you can compute the gradients like this:
def get_saliency_map(model, image, class_idx):
    with tf.GradientTape() as tape:
        tape.watch(image)
        predictions = model(image)
        loss = predictions[:, class_idx]
    # Get the gradients of the loss w.r.t. the input image.
    gradient = tape.gradient(loss, image)
    # Take the maximum across channels.
    gradient = tf.reduce_max(gradient, axis=-1)
    # Convert to numpy.
    gradient = gradient.numpy()
    # Normalize between 0 and 1.
    min_val, max_val = np.min(gradient), np.max(gradient)
    smap = (gradient - min_val) / (max_val - min_val + keras.backend.epsilon())
    return smap
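A possible way to call it on a single MNIST test image (this assumes the x_test array and the linear_model built above, plus numpy as np and matplotlib.pyplot as plt; the variable names are just illustrative):

image = tf.convert_to_tensor(np.expand_dims(x_test[0], axis=0), dtype=tf.float32)  # (1, 28, 28, 1)
pred_class = int(np.argmax(linear_model.predict(image)))  # class whose saliency we want
smap = get_saliency_map(linear_model, image, pred_class)  # (1, 28, 28) after the channel max
plt.imshow(smap[0], cmap='hot')
plt.show()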

a problem using LSTM network (neural networks)

I'm trying to create a speaker diarization system using LSTMs (I'm trying to make the network tell the difference between speakers).
This is the model I've created:
model = Sequential()
model.add(LSTM(768, batch_input_shape=(39, 40, 1), return_sequences=True))
model.add(Dense(256))
model.add(LSTM(768, return_sequences=True))
model.add(Dense(256))
model.add(LSTM(768, return_sequences=True))
model.add(Dense(4))
There are 4 different speakers.
In my dataset I have the array 'features' (of length 256, for 256 speech segments).
For each segment in 'features' I have 39 vectors representing that segment, and each of these vectors is of size 40.
Each of these 39 vectors is extracted from a different time window (I used log mel filterbank energies).
I also have the array 'labels', which is also of length 256 and contains the label for each segment.
I used 'to_categorical' for it:
labels = tf.keras.utils.to_categorical(labels, num_classes=4)
I tried using a generator to feed the data to the network, but it didn't work. This is the class I used:
class KerasBatchGenerator(object):
    def __init__(self, features, batch_size, labels):
        self.features = features
        self.batch_size = batch_size
        self.labels = labels

    def generate(self):
        while True:
            for i in self.labels:
                for j in self.features:
                    temp = [j, i]
                    # temp = np.expand_dims(temp, axis=1)
                    temp = np.expand_dims(temp, axis=2)
                    yield tuple(temp)
And this is the code I used to run the network:
train_data_generator = KerasBatchGenerator(features, batch_size, labels)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(train_data_generator.generate(), 100, 1)
please help!!!
If I guessed correctly, you want to classify which input is spoken by which speaker.
In that case your final layer should have shape (batch_size, numOfClasses), i.e. (39, 4).
But if you take a close look at the model summary, the output shape of the final layer is (39, 40, 4).
To get the proper shape, remove the argument return_sequences=True from the last LSTM layer.
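For reference, a sketch of the adjusted stack (the softmax on the last Dense layer is an addition on my part, since categorical_crossentropy expects class probabilities):

model = Sequential()
model.add(LSTM(768, batch_input_shape=(39, 40, 1), return_sequences=True))
model.add(Dense(256))
model.add(LSTM(768, return_sequences=True))
model.add(Dense(256))
model.add(LSTM(768))                        # no return_sequences: output shape (39, 768)
model.add(Dense(4, activation='softmax'))   # final shape (39, 4), one probability per speaker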

What does `y_dim` represent in the following GANs model?

This question is about an implementation example of GANs using TensorFlow on images.
I've excerpted the part of the code that I thought was enough to provide context; see the reference for the full code. The following code defines a discriminator function, which can loosely be considered a typical convolutional neural network. As can be seen, the discriminator operations are conditioned on y_dim; could someone help explain what y_dim is? Looking at the Args, I am still very confused about its definition.
class DCGAN(object):
    def __init__(self, sess, image_size=108, is_crop=True,
                 batch_size=64, sample_size=64, output_size=64,
                 y_dim=None, z_dim=100, gf_dim=64, df_dim=64,
                 gfc_dim=1024, dfc_dim=1024, c_dim=3, dataset_name='default',
                 checkpoint_dir=None, sample_dir=None):
        """
        Args:
            sess: TensorFlow session
            batch_size: The size of batch. Should be specified before training.
            output_size: (optional) The resolution in pixels of the images. [64]
            y_dim: (optional) Dimension of dim for y. [None]
            z_dim: (optional) Dimension of dim for Z. [100]
            gf_dim: (optional) Dimension of gen filters in first conv layer. [64]
            df_dim: (optional) Dimension of discrim filters in first conv layer. [64]
            gfc_dim: (optional) Dimension of gen units for fully connected layer. [1024]
            dfc_dim: (optional) Dimension of discrim units for fully connected layer. [1024]
            c_dim: (optional) Dimension of image color. For grayscale input, set to 1. [3]
        """
        ...

    def discriminator(self, image, y=None, reuse=False):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        if not self.y_dim:
            h0 = lrelu(conv2d(image, self.df_dim, name='d_h0_conv'))
            h1 = lrelu(self.d_bn1(conv2d(h0, self.df_dim * 2, name='d_h1_conv')))
            h2 = lrelu(self.d_bn2(conv2d(h1, self.df_dim * 4, name='d_h2_conv')))
            h3 = lrelu(self.d_bn3(conv2d(h2, self.df_dim * 8, name='d_h3_conv')))
            h4 = linear(tf.reshape(h3, [self.batch_size, -1]), 1, 'd_h3_lin')
            return tf.nn.sigmoid(h4), h4
        else:
            yb = tf.reshape(y, [self.batch_size, 1, 1, self.y_dim])
            x = conv_cond_concat(image, yb)
            h0 = lrelu(conv2d(x, self.c_dim + self.y_dim, name='d_h0_conv'))
            h0 = conv_cond_concat(h0, yb)
            h1 = lrelu(self.d_bn1(conv2d(h0, self.df_dim + self.y_dim, name='d_h1_conv')))
            h1 = tf.reshape(h1, [self.batch_size, -1])
            h1 = tf.concat(1, [h1, y])
            h2 = lrelu(self.d_bn2(linear(h1, self.dfc_dim, 'd_h2_lin')))
            h2 = tf.concat(1, [h2, y])
            h3 = linear(h2, 1, 'd_h3_lin')
            return tf.nn.sigmoid(h3), h3
y_dim is the length of the labels.
It seems to be used in the DCGAN class to indicate whether we know the number of training labels. If we do, we can define all these tensors with the known dimensions.
E.g. Look at the main.py file where the MNIST dataset is used. See that y_dim is set to 10 for the 0-9 labels? It is 'None' for the second option.
Hope this helps.
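A small TF2-style sketch of what that conditioning looks like in practice (this mirrors what conv_cond_concat does in that repository, but is rewritten here with current TF ops purely for illustration):

import tensorflow as tf

batch_size, y_dim = 64, 10   # e.g. MNIST: labels 0-9, so y_dim = 10

# One-hot label vectors y of shape (batch_size, y_dim)
labels = tf.random.uniform([batch_size], maxval=y_dim, dtype=tf.int32)
y = tf.one_hot(labels, depth=y_dim)

image = tf.random.normal([batch_size, 28, 28, 1])

# Reshape to (batch, 1, 1, y_dim), tile over the spatial dimensions, and
# append as extra channels -- the discriminator then "sees" the label.
yb = tf.reshape(y, [batch_size, 1, 1, y_dim])
yb_tiled = yb * tf.ones([batch_size, 28, 28, y_dim])
x = tf.concat([image, yb_tiled], axis=-1)   # shape (64, 28, 28, 1 + 10)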