Third derivative of an operator gives nan in TensorFlow 2.0

I am using the TensorFlow 2.0 GPU version.
I have simplified my example and am trying to calculate the third derivative of a Gaussian correlation function:
import numpy as np
import tensorflow as tf

def cov_mat(x1, x2):
    # Gaussian (squared-exponential) correlation function
    res = (x1 - x2)**2
    res = tf.exp(-res / 2)
    return res
This is how I construct gradients in graph mode:
@tf.function
def operator(x, y):
    k = cov_mat(x, y)
    grad = tf.gradients(k, x)[0]       # first derivative w.r.t. x
    grad2 = tf.gradients(grad, x)[0]   # second derivative
    grad3 = tf.gradients(grad2, x)[0]  # third derivative
    return grad, grad2, grad3
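An eager-mode equivalent, using nested gradient tapes instead of tf.gradients, would look roughly like this (just a sketch assuming the same cov_mat as above):

def operator_eager(x, y):
    # Nested tapes: each outer tape differentiates the result of the inner one
    with tf.GradientTape() as t3:
        t3.watch(x)
        with tf.GradientTape() as t2:
            t2.watch(x)
            with tf.GradientTape() as t1:
                t1.watch(x)
                k = cov_mat(x, y)
            grad = t1.gradient(k, x)
        grad2 = t2.gradient(grad, x)
    grad3 = t3.gradient(grad2, x)
    return grad, grad2, grad3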
Now, if I take the sample
grid = np.array([[0.]]).astype('float64')
grid = tf.convert_to_tensor(grid)
grad, grad2, grad3 = operator(grid,grid)
print(grad)
print(grad2)
print(grad3)
It returns
tf.Tensor([[-0.]], shape=(1, 1), dtype=float64)
tf.Tensor([[-1.]], shape=(1, 1), dtype=float64)
tf.Tensor([[nan]], shape=(1, 1), dtype=float64)
If I use different values in grid, it works fine.
So why does it give nan when x == y?

Related

No gradients provided for any variables - Custom loss function

I am trying to train a network with a custom loss function and I am getting this error:
ValueError: No gradients provided for any variable: ['conv2d/kernel:0', 'conv2d/bias:0', 'conv2d_1/kernel:0', 'conv2d_1/bias:0', 'conv2d_2/kernel:0', 'conv2d_2/bias:0', 'conv2d_3/kernel:0', 'conv2d_3/bias:0', 'conv2d_4/kernel:0', 'conv2d_4/bias:0', 'dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
The custom loss function is:
def cosine_sim_cal(self, vec1, vec2):
    vec1 = tf.convert_to_tensor([vec1])
    vec2 = tf.convert_to_tensor([vec2])
    cosine_loss = tf.keras.metrics.CosineSimilarity(axis=1)
    cosine_loss.update_state(vec1, vec2)
    return cosine_loss.result()
def triplets_loss(self, y_pred, m):
    eps = tf.keras.backend.epsilon()
    loss = 0.0
    for i in range(len(y_pred)):
        d_a_p = self.cosine_sim_cal(y_pred[i, 0, :], y_pred[i, 1, :])
        d_a_n = self.cosine_sim_cal(y_pred[i, 0, :], y_pred[i, 2, :])
        loss += tf.math.maximum((d_a_p - d_a_n + m), eps)
    return loss
The shape of y_pred is TensorShape([180, 3, 128]) and m is a float value. The loss function computes a loss value that looks like tf.Tensor(37.054775, shape=(), dtype=float32).
My training loop is:
model = self.model
train_loss_list = []
validation_loss_list = []
train_triplet_gen_instance = Triplet_Generator(x_data=self.train_class_dict, batch=self.batch)
val_triplet_gen_instance = Triplet_Generator(x_data=self.val_class_dict, batch=self.batch)
for epoch in range(self.epochs):
    total_train_loss = 0.0
    total_val_loss = 0.0
    for step in range(self.training_steps):
        x_train_batch = train_triplet_gen_instance.generate_batch()
        with tf.GradientTape() as tape:
            train_logits = model(x_train_batch, training=True)
            train_loss_value = self.triplets_loss(train_logits, m)
            total_train_loss += train_loss_value
        grads = tape.gradient(train_loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        if step % 20 == 0:
            print('Epoch: {}, Step: {}, training_loss:{}'.format(epoch, step, str(total_train_loss/step)))
    mean_training_loss = tf.divide(total_train_loss, self.training_steps)
    train_loss_list.append(mean_training_loss.numpy())
x_train_batch is a tuple of length 3. Every element of this tuple has shape (180, 200, 200, 3).
I am not able to figure out the bug in the code. If I change the loss to a distance-based loss function, the code works.
I found the problem in the custom loss function. It seems that tf.keras.metrics.CosineSimilarity(axis=1) is not differentiable, which is why no gradients were being calculated. To fix this, I rewrote the function as:
def triplets_loss(self, y_pred, m):
    eps = tf.keras.backend.epsilon()
    d_a_p = tf.convert_to_tensor(list(map(lambda x, y: tf.tensordot(x, y, axes=1)/(tf.norm(x)*tf.norm(y)), y_pred[:, 0, :], y_pred[:, 1, :])))
    d_a_n = tf.convert_to_tensor(list(map(lambda x, y: tf.tensordot(x, y, axes=1)/(tf.norm(x)*tf.norm(y)), y_pred[:, 0, :], y_pred[:, 2, :])))
    loss = tf.reduce_sum(tf.math.maximum((d_a_p - d_a_n + m), eps))
    return loss
With the new loss function, I was able to continue with the training.
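For reference, the same cosine triplet loss can also be written without the Python-level map, as a fully batched computation. This is just a sketch, assuming y_pred is laid out as (batch, 3, features) with anchor, positive and negative stacked along axis 1:

def triplets_loss_vectorized(y_pred, m):
    # y_pred: (batch, 3, features); the three slices are anchor, positive, negative
    anchor = y_pred[:, 0, :]
    positive = y_pred[:, 1, :]
    negative = y_pred[:, 2, :]

    def cosine_sim(a, b):
        # Row-wise cosine similarity between two (batch, features) tensors
        return tf.reduce_sum(a * b, axis=1) / (tf.norm(a, axis=1) * tf.norm(b, axis=1))

    eps = tf.keras.backend.epsilon()
    d_a_p = cosine_sim(anchor, positive)
    d_a_n = cosine_sim(anchor, negative)
    return tf.reduce_sum(tf.math.maximum(d_a_p - d_a_n + m, eps))

Every operation here is an ordinary differentiable tensor op, so gradients flow through it the same way as in the rewritten loss above, while avoiding one small graph per triplet.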

Chamfer distance Raises Error in a Keras autoencoder model

I am currently designing an autoencoder based on the PointNet algorithm, and I am facing a problem that apparently comes from my loss function, the Chamfer distance.
When I try to run the training process, the following error is raised.
I would really appreciate your help.
ValueError: Expected list with element dtype float but got list with element dtype double for '{{node gradient_tape/chamfer_distance_tf/map/while/gradients/chamfer_distance_tf/map/while/TensorArrayV2Write/TensorListSetItem_grad/TensorListGetItem}} = TensorListGetItem[element_dtype=DT_FLOAT](gradient_tape/chamfer_distance_tf/map/while/gradients/grad_ys_0, gradient_tape/chamfer_distance_tf/map/while/gradients/chamfer_distance_tf/map/while/TensorArrayV2Write/TensorListSetItem_grad/TensorListSetItem/TensorListPopBack:1, gradient_tape/chamfer_distance_tf/map/while/gradients/chamfer_distance_tf/map/while/TensorArrayV2Write/TensorListSetItem_grad/Shape)' with input shapes: [], [], [0].
ERROR:tensorflow:Error: Input value Tensor("chamfer_distance_tf/map/while/add:0", shape=(), dtype=float32) has dtype <dtype: 'float32'>, but expected dtype <dtype: 'float64'>. This leads to undefined behavior and will be an error in future versions of TensorFlow.
Here is my loss function:
def distance_matrix(array1, array2):
    # Pairwise distances between all points of array1 and array2 (num_point x num_point)
    num_point, num_features = array1.shape
    expanded_array1 = tf.tile(array1, (num_point, 1))
    expanded_array2 = tf.reshape(
        tf.tile(tf.expand_dims(array2, 1),
                (1, num_point, 1)),
        (-1, num_features))
    distances = tf.norm(expanded_array1 - expanded_array2, axis=1)
    distances = tf.reshape(distances, (num_point, num_point))
    return distances

def av_dist(array1, array2):
    # Mean nearest-neighbour distance from array1 to array2
    distances = distance_matrix(array1, array2)
    distances = tf.reduce_min(distances, axis=1)
    distances = tf.reduce_mean(distances)
    return distances

def av_dist_sum(arrays):
    # Symmetric sum of the two directed average distances
    array1, array2 = arrays
    av_dist1 = av_dist(array1, array2)
    av_dist2 = av_dist(array2, array1)
    return av_dist1 + av_dist2

def chamfer_distance_tf(array1, array2):
    batch_size, num_point, num_features = array1.shape
    dist = tf.reduce_mean(
        tf.map_fn(av_dist_sum, elems=(array1, array2), dtype=tf.float64), axis=-1)
    return dist
print(chamfer_distance_tf(X, X))

model.compile(
    loss=chamfer_distance_tf,
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["sparse_categorical_accuracy"],
)
model.fit(train_data, train_data, epochs=5, batch_size=10)
And here is the first part of the code:
list_pcl = []
for filename in os.listdir(directory):
    if filename.endswith(".ply"):
        directory_file = os.path.join(directory, filename)
        print(directory_file)
        pcl = PlyData.read(directory_file)
        data = pcl.elements[0].data
        data = np.asarray(data.tolist())
        data.resize(1024, 3)
        print(type(data))
        print(data.shape)
        list_pcl.append(data)
print(len(list_pcl))

#X = np.asarray(list_pcl[0:73999])
#X_val = np.asarray(list_pcl[74000:74299])
#X_test = np.asarray(list_pcl[74300:74329])
X = np.asarray(list_pcl[0:200])
X_val = np.asarray(list_pcl[200:220])
X_test = np.asarray(list_pcl[220:228])
random.shuffle(X)
random.shuffle(X_val)
random.shuffle(X_test)
"""**Reshaping the dataset**
The neural network is unable to treat data with different input size, that's why we apply a zero padding to all the data to reach the size of the point cloud data with the biggest number of raws.
We additioally reshape the outcome by adding one dimension corresponidng to the number of channels to the tesors.
"""
train_num = X.shape[0]
val_num = X_val.shape[0]
test_num = X_test.shape[0]
points_num = X.shape[1]
features_num = X.shape[2]
train_data = X.reshape([-1, points_num, features_num]).astype(float)
val_data = X_val.reshape([-1, points_num, features_num]).astype(float)
test_data = X_test.reshape([-1, points_num, features_num]).astype(float)
print(train_data.shape)
print(val_data.shape)
print(test_data.shape)
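One plausible source of the dtype error above (an assumption, not something verified against the full model): .astype(float) produces float64 NumPy arrays and tf.map_fn is told to expect float64 outputs, while the Keras model itself computes in float32 by default, so the gradient of the map_fn loop mixes the two dtypes. A minimal sketch of keeping everything consistently in float32 would be:

# Sketch (assumption): keep the data and the map_fn output dtype consistently
# float32, since Keras layers compute in float32 by default.
train_data = X.reshape([-1, points_num, features_num]).astype('float32')
val_data = X_val.reshape([-1, points_num, features_num]).astype('float32')
test_data = X_test.reshape([-1, points_num, features_num]).astype('float32')

def chamfer_distance_tf(array1, array2):
    batch_size, num_point, num_features = array1.shape
    dist = tf.reduce_mean(
        tf.map_fn(av_dist_sum, elems=(array1, array2), dtype=tf.float32), axis=-1)
    return dist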

Why are GradientTape results being multiplied by batch_size in TensorFlow?

I'm trying to get the gradients of a loss function w.r.t. another tensor, but the gradients are being multiplied by the input batch size that I feed into my model.
import tensorflow as tf
from tensorflow.keras import Sequential, layers

# Sample states and returns
states = tf.random.uniform(shape=(100, 4))
returns = tf.constant([float(i) for i in range(100)])

# Creating datasets to feed data to the model
states = tf.data.Dataset.from_tensor_slices(states)
returns = tf.data.Dataset.from_tensor_slices(returns)

# Zipping the datasets into one
batch_size = 4
dataset = tf.data.Dataset.zip((states, returns)).batch(batch_size)

model = Sequential([layers.Dense(128, input_shape=(4,), activation=tf.nn.relu),
                    layers.Dense(1, activation=tf.nn.tanh)])

for state_batch, returns_batch in dataset:
    with tf.GradientTape(persistent=True) as tape:
        values = model(state_batch)
        loss = returns_batch - values
    # d_loss/d_values should be -1.0, but I'm getting -1.0 * batch_size
    print(tape.gradient(loss, values))
    break
Output:
tf.Tensor(
[[-4.]
[-4.]
[-4.]
[-4.]], shape=(4, 1), dtype=float32)
Expected Output:
tf.Tensor(
[[-1.]
[-1.]
[-1.]
[-1.]], shape=(4, 1), dtype=float32)
From the code, you can see that loss = returns - values, so it should be d_loss/d_values = -1.0, but the result I'm getting is d_loss/d_values = -1.0 * batch_size. Can someone point out why this is happening? How can I get the correct results?
colab link : https://colab.research.google.com/drive/1x4pyGJ5ccRVSMzDAeLzcPXRtO7cNFnJf?usp=sharing
The problem is in this line:
loss = returns_batch - values
Here, returns_batch has shape (4,), but values has shape (4, 1). The subtraction broadcasts the two tensors, resulting in a loss tensor of shape (4, 4) with four repeated columns. As a result, a single element of values affects four elements of the loss, hence the scaled gradient value. You can fix it, for example, like this:
loss = returns_batch - tf.squeeze(values, axis=1)
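A quick standalone way to see the broadcasting at work (the shapes and values here are illustrative, not taken from the question's model) is to compare the two loss variants directly:

import tensorflow as tf

returns_batch = tf.constant([1., 2., 3., 4.])        # shape (4,)
values = tf.Variable([[0.1], [0.2], [0.3], [0.4]])   # shape (4, 1)

with tf.GradientTape(persistent=True) as tape:
    broadcast_loss = returns_batch - values                   # broadcasts to shape (4, 4)
    fixed_loss = returns_batch - tf.squeeze(values, axis=1)   # stays shape (4,)

print(broadcast_loss.shape)                   # (4, 4): each value is subtracted from all four returns
print(tape.gradient(broadcast_loss, values))  # [[-4.] [-4.] [-4.] [-4.]]
print(tape.gradient(fixed_loss, values))      # [[-1.] [-1.] [-1.] [-1.]]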

How does BatchNormalization work on an example?

I am trying to understand batchnorm.
My humble example
layer1 = tf.keras.layers.BatchNormalization(scale=False, center=False)
x = np.array([[3.,4.]])
out = layer1(x)
print(out)
Prints
tf.Tensor([[2.99850112 3.9980015 ]], shape=(1, 2), dtype=float64)
My attempt to reproduce it
e=0.001
m = np.sum(x)/2
b = np.sum((x - m)**2)/2
x_=(x-m)/np.sqrt(b+e)
print(x_)
It prints
[[-0.99800598 0.99800598]]
What am I doing wrong?
Two problems here.
First, batch norm has two "modes": training, where normalization is done via the batch statistics, and inference, where normalization is done via "population statistics" collected from batches during training. By default, Keras layers/models run in inference mode, and you need to pass training=True in the call to change this (there are other ways, but that is the simplest one).
layer1 = tf.keras.layers.BatchNormalization(scale=False, center=False)
x = np.array([[3.,4.]], dtype=np.float32)
out = layer1(x, training=True)
print(out)
This prints tf.Tensor([[0. 0.]], shape=(1, 2), dtype=float32). Still not right!
Second, batch norm normalizes over the batch axis, separately for each feature. However, the way you specify the input (as a 1x2 array) is basically a single input (batch size 1) with two features. Batch norm just normalizes each feature to mean 0 (standard deviation is not defined). Instead, you want two inputs with a single feature:
layer1 = tf.keras.layers.BatchNormalization(scale=False, center=False)
x = np.array([[3.],[4.]], dtype=np.float32)
out = layer1(x, training=True)
print(out)
This prints
tf.Tensor(
[[-0.99800634]
[ 0.99800587]], shape=(2, 1), dtype=float32)
Alternatively, specify the "feature axis":
layer1 = tf.keras.layers.BatchNormalization(axis=0, scale=False, center=False)
x = np.array([[3.,4.]], dtype=np.float32)
out = layer1(x, training=True)
print(out)
Note that the input shape is "wrong", but we told batchnorm that axis 0 is the feature axis (it defaults to -1, the last axis). This will also give the desired result:
tf.Tensor([[-0.99800634 0.99800587]], shape=(1, 2), dtype=float32)
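As a sanity check, these numbers match the manual computation from the question once the batch statistics are taken over the two samples: m = (3 + 4)/2 = 3.5, b = ((3 - 3.5)^2 + (4 - 3.5)^2)/2 = 0.25, and with epsilon = 0.001 the normalized values are (3 - 3.5)/sqrt(0.251) ≈ -0.998 and (4 - 3.5)/sqrt(0.251) ≈ 0.998, which is exactly what the layer produces in training mode.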

How to backprop through a model that predicts the weights for another in Tensorflow

I am currently trying to train a model (hypernetwork) that can predict the weights for another model (main network) such that the main network's cross-entropy loss decreases. However, when I use tf.assign to assign the new weights to the network, it does not allow backpropagation into the hypernetwork, rendering the system non-differentiable. I have tested whether my weights are properly updated, and they seem to be, since subtracting the initial weights from the updated ones gives a non-zero sum.
This is a minimal sample of what I am trying to achieve.
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import softmax

def random_addition(variables):
    addition_update_ops = []
    for variable in variables:
        update = tf.assign(variable, variable + tf.random_normal(shape=variable.get_shape()))
        addition_update_ops.append(update)
    return addition_update_ops

def network_predicted_addition(variables, network_preds):
    addition_update_ops = []
    for idx, variable in enumerate(variables):
        if idx == 0:
            print(variable)
            update = tf.assign(variable, variable + network_preds[idx])
            addition_update_ops.append(update)
    return addition_update_ops

def dense_weight_update_net(inputs, reuse):
    with tf.variable_scope("weight_net", reuse=reuse):
        output = tf.layers.conv2d(inputs=inputs, kernel_size=(3, 3), filters=16, strides=(1, 1),
                                  activation=tf.nn.leaky_relu, name="conv_layer_0", padding="SAME")
        output = tf.reduce_mean(output, axis=[0, 1, 2])
        output = tf.reshape(output, shape=(1, output.get_shape()[0]))
        output = tf.layers.dense(output, units=(16*3*3*3))
        output = tf.reshape(output, shape=(3, 3, 3, 16))
    return output

def conv_net(inputs, reuse):
    with tf.variable_scope("conv_net", reuse=reuse):
        output = tf.layers.conv2d(inputs=inputs, kernel_size=(3, 3), filters=16, strides=(1, 1),
                                  activation=tf.nn.leaky_relu, name="conv_layer_0", padding="SAME")
        output = tf.reduce_mean(output, axis=[1, 2])
        output = tf.layers.dense(output, units=2)
        output = softmax(output)
    return output

input_x_0 = tf.zeros(shape=(32, 32, 32, 3))
target_y_0 = tf.zeros(shape=(32), dtype=tf.int32)
input_x_1 = tf.ones(shape=(32, 32, 32, 3))
target_y_1 = tf.ones(shape=(32), dtype=tf.int32)
input_x = tf.concat([input_x_0, input_x_1], axis=0)
target_y = tf.concat([target_y_0, target_y_1], axis=0)

output_0 = conv_net(inputs=input_x, reuse=False)
target_y = tf.one_hot(target_y, 2)
crossentropy_loss_0 = tf.losses.softmax_cross_entropy(onehot_labels=target_y, logits=output_0)

conv_net_parameters = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="conv_net")
weight_net_parameters = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="weight_net")
print(conv_net_parameters)
weight_updates = dense_weight_update_net(inputs=input_x, reuse=False)
#updates_0 = random_addition(conv_net_parameters)
updates_1 = network_predicted_addition(conv_net_parameters, network_preds=[weight_updates])
with tf.control_dependencies(updates_1):
    output_1 = conv_net(inputs=input_x, reuse=True)
crossentropy_loss_1 = tf.losses.softmax_cross_entropy(onehot_labels=target_y, logits=output_1)
check_sum = tf.reduce_sum(tf.abs(output_0 - output_1))

c_opt = tf.train.AdamOptimizer(beta1=0.9, learning_rate=0.001)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # Needed for correct batch norm usage
with tf.control_dependencies(update_ops):  # Needed for correct batch norm usage
    train_variables = weight_net_parameters  #+ conv_net_parameters
    c_error_opt_op = c_opt.minimize(crossentropy_loss_1,
                                    var_list=train_variables,
                                    colocate_gradients_with_ops=True)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init = sess.run(init)
    loss_list_0 = []
    loss_list_1 = []
    for i in range(1000):
        _, checksum, crossentropy_0, crossentropy_1 = sess.run([c_error_opt_op, check_sum, crossentropy_loss_0,
                                                                crossentropy_loss_1])
        loss_list_0.append(crossentropy_0)
        loss_list_1.append(crossentropy_1)
        print(checksum, np.mean(loss_list_0), np.mean(loss_list_1))
Does anyone know how I can get tensorflow to compute the gradients for this? Thank you.
In this case your weights aren't variables, they are computed tensors based on the hypernetwork. All you really have is one network during training. If I understand you correctly you are then proposing to discard the hypernetwork and be able to use just the main network to perform predictions.
If this is the case then you can either save the weight values manually and reload them as constants, or you could use tf.cond and tf.assign to assign them as you are doing during training, but use tf.cond to choose to use the variable or the computed tensor depending on whether you're doing training or inference.
During training you will need to use the computed tensor from the hypernetwork in order to enable backprop.
Example from the comments: w is the weight you'll use. You can assign it to a variable during training to keep track of it, but then use tf.cond to either use the variable (during inference) or the computed value from the hypernetwork (during training). In this example you need to pass in a boolean placeholder is_training_placeholder to indicate whether you're running training or inference.
tf.assign(w_variable, w_from_hypernetwork)
w = tf.cond(is_training_placeholder, true_fn=lambda: w_from_hypernetwork, false_fn=lambda: w_variable)
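To make the training-time path concrete, the main network can also consume the predicted kernel directly instead of going through tf.assign, which is what lets gradients reach the hypernetwork. A rough sketch only; predicted_kernel and the layer wiring here are assumptions, not taken from the original code:

# Sketch: build the conv layer from the hypernetwork's output tensor so that the
# main network's cross-entropy loss backpropagates into the hypernetwork.
predicted_kernel = dense_weight_update_net(inputs=input_x, reuse=False)   # shape (3, 3, 3, 16)
features = tf.nn.conv2d(input_x, filter=predicted_kernel, strides=[1, 1, 1, 1], padding="SAME")
features = tf.nn.leaky_relu(features)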