Multi-GPU mixed_precision in TensorFlow yields NaN prediction values

def get_model():
    with STRATEGY.scope():
        # Set seed for deterministic weight initialization
        seed_everything()
        # Inputs; note the names are equal to the dictionary keys in the dataset
        image = tf.keras.layers.Input(INPUT_SHAPE, name='image', dtype=tf.uint8)
        image_norm = normalize(image)
        backbone = convnext.ConvNeXtV2Tiny(
            input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
            pretrained='imagenet21k-ft1k',
            num_classes=0,
        )
        # CNN backbone feature maps, shape BxHxWxC
        features = backbone(image_norm)
        # Average pooling BxHxWxC -> BxC
        pooled_features = tf.keras.layers.GlobalAveragePooling2D()(features)
        dropout_features = tf.keras.layers.Dropout(0.30)(pooled_features)
        # Output value in [0, 1] using a sigmoid activation
        outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dropout_features)
        # Loss
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        model = tf.keras.models.Model(inputs=image, outputs=outputs)
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return model
When I set mixed_precision.Policy('mixed_float16'), the model's predictions are NaN. However, with mixed_precision.Policy('float32'), the predictions are between 0 and 1.
Is there any reason why mixed precision causes NaN values?
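A common cause is float16 overflow/underflow in the loss or gradients. Below is a minimal sketch of the usual mixed-precision precautions, assuming TF 2.x Keras; the layer split and the optimizer wrapper are illustrative, not taken from the model above:

import tensorflow as tf
from tensorflow.keras import mixed_precision

# Compute in float16, keep variables in float32
mixed_precision.set_global_policy('mixed_float16')

def classification_head(features):
    # Keep the final activation (and therefore the loss input) in float32
    # so the sigmoid and binary cross-entropy are not computed in float16.
    x = tf.keras.layers.Dense(1)(features)
    return tf.keras.layers.Activation('sigmoid', dtype='float32')(x)

# Loss scaling guards against float16 gradient underflow. Model.fit wraps the
# optimizer automatically under a mixed_float16 policy, but for custom training
# loops it can be applied explicitly:
optimizer = mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam(1e-4))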

Related

How to stop training CNN part while continue training ANN part in a Multi-input Model?

I made a multi-input model in Keras which takes an image of shape [N, 640, 480, 3] as well as numerical data of shape [N, 19] and predicts one of 12 classes.
The following is the model-defining part of the code:
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# MODEL === CNN
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
base_model = keras.applications.ResNet50(
    weights='imagenet',      # Load weights pre-trained on ImageNet.
    input_shape=(640, 480, 3),
    include_top=False)       # Do not include the ImageNet classifier at the top.
base_model.trainable = False
input_Cnn = keras.Input(shape=(640, 480, 3))
x = base_model(input_Cnn, training=False)
# Convert features of shape `base_model.output_shape[1:]` to vectors
x = keras.layers.GlobalAveragePooling2D()(x)
# Dense classification head on top of the frozen backbone
x1 = keras.layers.Dense(1024, activation="relu")(x)
out_Cnn = keras.layers.Dense(12, activation="relu")(x1)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# MODEL === NN
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
inp_num = keras.layers.Input(shape=(19,)) # no. of columns of the numerical data
fc1 = keras.layers.Dense(units=2 ** 6, activation="relu")(inp_num)
fc2 = keras.layers.Dense(units=2 ** 8, activation="relu")(fc1)
fc3 = keras.layers.Dense(units=2 ** 10, activation="relu")(fc2)
fc4 = keras.layers.Dense(units=2 ** 8, activation="relu")(fc3)
fc5 = keras.layers.Dense(units=2 ** 6, activation="relu")(fc4)
out_NN = keras.layers.Dense(12, activation="relu")(fc5)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# CONCATENATION
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
result = keras.layers.concatenate((out_Cnn, out_NN), axis=-1) # [N, 12] --- concatenate [N, 12] ==> [N, 24]
result = keras.layers.Dense(1024, activation='relu')(result)
result = keras.layers.Dense(units=12, activation="softmax")(result)
model = keras.Model([input_Cnn, inp_num], result)
print(model.summary())
The problem is that the CNN part (if trained independently) converges in fewer epochs, while the ANN part (if trained independently) takes longer (more epochs). But when both are combined in this code, accuracy doesn't go beyond 10%. Is there any way to stop gradients from flowing into the CNN part after a certain number of epochs, so that after that the model trains only the ANN part?
I'm not using Keras, but after a quick Google search this should be the answer:
You can freeze layers, so that certain parameters are not learnable anymore:
# this freezes the first N layers
for layer in model.layers[:N]:
    layer.trainable = False
where N is the number of convolutional layers you have.
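If the goal is to keep training the CNN part for a while and only then stop gradients to it, a Keras callback that flips trainable flags at a chosen epoch is one option. This is a rough sketch, not tested against the exact model above; the layers_to_freeze argument, the metrics list, and the epoch threshold of 10 are illustrative:

import tensorflow as tf
from tensorflow import keras

class FreezeLayersAfter(keras.callbacks.Callback):
    """Freezes the given layers once a chosen epoch is reached."""

    def __init__(self, layers_to_freeze, freeze_at_epoch=10):
        super().__init__()
        self.layers_to_freeze = layers_to_freeze
        self.freeze_at_epoch = freeze_at_epoch

    def on_epoch_begin(self, epoch, logs=None):
        if epoch == self.freeze_at_epoch:
            for layer in self.layers_to_freeze:
                layer.trainable = False
            # Recompile so the change to the trainable weights takes effect.
            self.model.compile(optimizer=self.model.optimizer,
                               loss=self.model.loss,
                               metrics=['accuracy'])  # metrics assumed; adjust as needed

# Illustrative usage with the model above: freeze the CNN branch
# (here just base_model; named Dense layers could be added via model.get_layer):
# model.fit([img_data, num_data], labels, epochs=30,
#           callbacks=[FreezeLayersAfter([base_model], freeze_at_epoch=10)])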

How to use a pre-trained embedding matrix in tensorflow 2.0 RNN as initial weights in an embedding layer?

I'd like to use a pretrained GloVe embedding as the initial weights for an embedding layer in an RNN encoder/decoder. The code is in Tensorflow 2.0. Simply adding the embedding matrix as a weights = [embedding_matrix] parameter to the tf.keras.layers.Embedding layer won't do it because the encoder is an object and I'm not sure how to effectively pass the embedding_matrix to this object at training time.
My code closely follows the neural machine translation example in the Tensorflow 2.0 documentation. How would I add a pre-trained embedding matrix to the encoder in this example? The encoder is an object. When I get to training, the GloVe embedding matrix is unavailable to the Tensorflow graph. I get the error message:
RuntimeError: Cannot get value inside Tensorflow graph function.
The code uses the GradientTape method and teacher forcing in the training process.
I've tried modifying the encoder object to include the embedding_matrix at various points, including in the encoder's init, call and initialize_hidden_state. All of these fail. The other questions on stackoverflow and elsewhere are for Keras or older versions of Tensorflow, not Tensorflow 2.0.
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix])
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

# ... Bahdanau Attention, Decoder layers, and train_step defined, see link to full tensorflow code above ...

# Relevant training code
EPOCHS = 10
training_record = pd.DataFrame(columns=['epoch', 'training_loss', 'validation_loss', 'epoch_time'])

for epoch in range(EPOCHS):
    template = 'Epoch {}/{}'
    print(template.format(epoch + 1, EPOCHS))
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    total_val_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            template = 'batch {} ============== train_loss: {}'
            print(template.format(batch + 1, round(batch_loss.numpy(), 4)))
I was trying to do the same thing and getting the exact same error. The problem is that the weights argument of the Embedding layer is deprecated. Changing weights= to embeddings_initializer= worked for me.
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                           embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                           trainable=False)
Firstly, load the pretrained embedding matrix using:
import os
import numpy as np
import tensorflow as tf

def pretrained_embeddings(file_path, EMBEDDING_DIM, VOCAB_SIZE, word2idx):
    # 1. Load the pre-trained word vectors (a feature vector for each word)
    print("graph in function", tf.get_default_graph())
    print('Loading word vectors...')
    word2vec = {}
    with open(os.path.join(file_path + '.%sd.txt' % EMBEDDING_DIM), errors='ignore', encoding='utf8') as f:
        # The file is just a space-separated text file in the format:
        # word vec[0] vec[1] vec[2] ...
        for line in f:
            values = line.split()
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
            word2vec[word] = vec
    print('Found %s word vectors.' % len(word2vec))

    # 2. Prepare the embedding matrix, initialized with zeros
    print('Filling pre-trained embeddings...')
    num_words = VOCAB_SIZE
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word2idx.items():
        if i < VOCAB_SIZE:
            embedding_vector = word2vec.get(word)
            if embedding_vector is not None:
                # Words not found in the embedding index will be all zeros.
                embedding_matrix[i] = embedding_vector
    return embedding_matrix
Secondly, update the Encoder class as follows:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embedding_matrix):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix])
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
Thirdly, call the function that loads the pre-trained embeddings to get the embedding matrix:
embedding_matrix = pretrained_embeddings(file_path, EMBEDDING_DIM, vocab_size, word2idx)
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, embedding_matrix)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
Note: this works well on TensorFlow 1.13.1.
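As a side note, for TensorFlow 2.x (where, per the earlier answer, the weights argument is deprecated), the same Encoder could presumably use embeddings_initializer instead. An untested sketch reusing the names above:

import tensorflow as tf

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embedding_matrix):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # Initialize the embedding table from the pre-trained matrix instead of
        # passing the deprecated weights= argument.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')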

implement one RNN layer in deep DAE seems worse performance

I was trying to implement one RNN layer in a deep DAE, as shown in the figure (DRDAE architecture).
My code is modified from the DAE tutorial; I changed one layer to a basic LSTM RNN layer. It somehow works, but the noise in the output seems to lie in the same places across different pictures.
However, compared to both a single RNN layer and the DAE tutorial, the performance of this structure is much worse, and it requires many more iterations to reach a lower cost.
Can someone explain why this structure gets worse results? Below is my code for the DRDAE.
# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import

import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

# Parameters
learning_rate = 0.0001
training_epochs = 50001
batch_size = 256
display_step = 500
examples_to_show = 10
total_batch = int(mnist.train.num_examples / batch_size)

# Network Parameters
n_input = 784     # data input
n_hidden_1 = 392  # 1st layer num features
n_hidden_2 = 196  # 2nd layer num features
n_steps = 14

# tf Graph input
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_1])),
    'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}

def RNN(x, size, weights, biases):
    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
    # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.split(x, n_steps, 1)
    # Define an LSTM cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(size, forget_bias=1.0)
    # Get LSTM cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights) + biases

# Building the encoder
def encoder(x):
    # Encoder hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    # Encoder hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

# Building the decoder
def decoder(x):
    # RNN layer replacing the first decoder hidden layer
    layer_1 = RNN(x, n_hidden_2, weights['decoder_h1'], biases['decoder_b1'])
    # Decoder hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (Labels) are the original data.
y_true = Y

# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    # with tf.device("/cpu:0"):
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        # Loop over all batches
        for i in range(total_batch):
            batch, _ = mnist.train.next_batch(batch_size)
            origin = batch
            # Run optimization op (backprop) and cost op (to get loss value)
            sess.run(optimizer, feed_dict={X: batch, Y: origin})
        # Display logs per epoch step
        if epoch % display_step == 0:
            c, acy = sess.run([cost, accuracy], feed_dict={X: batch, Y: origin})
            print("Epoch:", '%05d' % (epoch + 1), "cost =", "{:.9f}".format(c), "accuracy =", "{:.3f}".format(acy))
    print("Optimization Finished!")

    # Applying encode and decode over test set
    encode_decode = sess.run(
        y_pred, feed_dict={X: mnist.test.images[:examples_to_show]})
    # Compare original images with their reconstructions
    f, a = plt.subplots(2, 10, figsize=(10, 2))
    for i in range(examples_to_show):
        a[0][i].imshow(np.reshape(mnist.test.images[i], (28, 28)))
        a[1][i].imshow(np.reshape(encode_decode[i], (28, 28)))

Tensorflow Autoencoder - How To Calculate Reconstruction Error?

I've implemented the following Autoencoder in Tensorflow as shown below. It basically takes MNIST digits as inputs, learns the structure of the data and reproduces the input at its output.
from __future__ import division, print_function, absolute_import

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)

# Parameters
learning_rate = 0.01
training_epochs = 20
batch_size = 256
display_step = 1
examples_to_show = 10

# Network Parameters
n_hidden_1 = 256  # 1st layer num features
n_hidden_2 = 128  # 2nd layer num features
n_input = 784     # MNIST data input (img shape: 28*28)

# tf Graph input (only pictures)
X = tf.placeholder("float", [None, n_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_1])),
    'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}

# Building the encoder
def encoder(x):
    # Encoder hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
                                   biases['encoder_b1']))
    # Encoder hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
                                   biases['encoder_b2']))
    return layer_2

# Building the decoder
def decoder(x):
    # Decoder hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
                                   biases['decoder_b1']))
    # Decoder hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
                                   biases['decoder_b2']))
    return layer_2

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X

# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    total_batch = int(mnist.train.num_examples / batch_size)
    # Training cycle
    for epoch in range(training_epochs):
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch + 1),
                  "cost=", "{:.9f}".format(c))
    print("Optimization Finished!")

    # Applying encode and decode over test set
    encode_decode = sess.run(
        y_pred, feed_dict={X: mnist.test.images[:examples_to_show]})
    # Compare original images with their reconstructions
    f, a = plt.subplots(2, 10, figsize=(10, 2))
    for i in range(examples_to_show):
        a[0][i].imshow(np.reshape(mnist.test.images[i], (28, 28)))
        a[1][i].imshow(np.reshape(encode_decode[i], (28, 28)))
    f.show()
    plt.draw()
    plt.waitforbuttonpress()
When I am encoding and decoding over the test set, how do I calculate the reconstruction error (i.e. the Mean Squared Error/Loss) for each sample?
In other words, I'd like to see how well the Autoencoder can reconstruct its input, so that I can use the Autoencoder as a single-class classifier.
Many thanks in advance.
Barry
You can take the output of the decoder, take the difference with the true image, and average it.
Say y is the output of the decoder and x is the original test image; then, for each of the examples, you can compute the squared difference and average over it:
tf.square(y-x)
This will be your reconstruction cost for the test set.
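To get the error per sample rather than one scalar for the whole test set, average only over the pixel axis. A minimal sketch in the same TF1 style as the code above, using y_pred and y_true as defined there:

# Mean squared error per test image: average over the 784 pixels only,
# yielding a vector with one reconstruction error per sample.
per_sample_mse = tf.reduce_mean(tf.square(y_pred - y_true), axis=1)

# Run inside the existing `with tf.Session() as sess:` block, e.g.:
# errors = sess.run(per_sample_mse,
#                   feed_dict={X: mnist.test.images[:examples_to_show]})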

ValueError: No gradients provided for any variable

I'm facing trouble with TensorFlow. Executing the following code:
import tensorflow as tf
import input_data
learning_rate = 0.01
training_epochs = 25
batch_size = 100
display_step = 1
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# tensorflow graph input
X = tf.placeholder('float', [None, 784]) # mnist data image of shape 28 * 28 = 784
Y = tf.placeholder('float', [None, 10]) # 0-9 digits recognition = > 10 classes
# set model weights
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
# Our hypothesis
activation = tf.add(tf.matmul(X, W),b) # Softmax
# Cost function: cross entropy
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=activation, logits=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) # Gradient Descen
I get the following error:
ValueError: No gradients provided for any variable, check your graph
for ops that do not support gradients, between variables
['Tensor("Variable/read:0", shape=(784, 10), dtype=float32)',
'Tensor("Variable_1/read:0", shape=(10,), dtype=float32)'] and loss
Tensor("Mean:0", shape=(), dtype=float32).
This problem is caused by the following line: tf.nn.softmax_cross_entropy_with_logits(labels=activation, logits=Y)
Based on documentation you should have
labels: Each row labels[i] must be a valid probability distribution.
logits: Unscaled log probabilities.
So logits is supposed to be your hypothesis and thus equal to activation, and the valid probability distribution is Y. So just change it to tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=activation).
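Applied to the snippet in the question, the corrected lines would look like this:

# labels: ground-truth one-hot targets (Y); logits: unscaled model outputs (activation)
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=activation))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)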
I ended up here because I had passed my input X data to my model, but not my expected outputs. I had:
model.fit(X, epochs=30) # whoops!
I should have had:
model.fit(X, y, epochs=30) # fixed!
In my case I had forgotten the compile step for the model:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])