How can I print the loss value obtained from model.add_loss() function in Tensorflow 2 (with Keras API)? - tensorflow2.0

I have a classification model in the Keras API of TensorFlow 2. The model has two losses in the example below: categorical cross-entropy and KL-divergence. Categorical cross-entropy is added when compiling the model and hence is printed while training. KL-divergence is added separately using model.add_loss(), but it is not displayed during training. Is there a way to print it while training? The sample code is shown below. Since it runs on simulated data, the loss value may come out as NaN.
import tensorflow as tf
import numpy as np

keras = tf.keras
K = tf.keras.backend

def ohc(y):
    # one-hot encode the integer labels
    b = np.zeros((y.size, y.max() + 1))
    b[np.arange(y.size), y] = 1
    return b
xtrain = np.random.rand(100,50)
y = np.random.randint(4, size=100)
y_one_hot = ohc(y)
def my_kld(y_true, y_pred):
    y_pred2 = tf.keras.layers.Lambda(lambda x: x + 0.00001)(y_pred)
    LR = y_true / y_pred2
    logLR = K.log(LR)
    kld = y_true * logLR
    loss = K.mean(kld)
    loss = tf.keras.layers.Lambda(lambda x: x * 0.2)(loss)
    return loss
x_t = keras.layers.Input((50,))
y_t = keras.layers.Input((4,))
x1 = keras.layers.Dense(100, activation = 'relu')(x_t)
x2 = keras.layers.Dense(4, activation = 'softmax')(x1)
model = keras.models.Model([x_t,y_t], x2)
model.add_loss(my_kld(y_t, x2))
optim = keras.optimizers.Nadam(0.00006)
model.compile(loss=['categorical_crossentropy'], optimizer=optim, metrics=['accuracy'])
model.fit(x=[xtrain,y_one_hot], y = y_one_hot, epochs = 100)
The code that I have tried is included in the question in an implementable way. However, there does not seem to be any way to print the loss from model.add_loss().
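A minimal sketch of one possible workaround (this is not part of the original question, and it assumes the model, y_t, x2 and optim defined above): register the same tensor as a named metric via model.add_metric(), so that Keras reports it alongside the loss each epoch. These lines would replace the model.add_loss() and model.compile() calls above.
kld_tensor = my_kld(y_t, x2)  # the same tensor that is passed to add_loss
model.add_loss(kld_tensor)
# aggregation='mean' may be required for symbolic tensors in some TF 2.x versions
model.add_metric(kld_tensor, name='kld', aggregation='mean')
model.compile(loss=['categorical_crossentropy'], optimizer=optim, metrics=['accuracy'])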

Related

Training `tfd.MixtureSameFamily` gets 'NaN'

I want to train a mixture density model using tfd.MixtureSameFamily. But after several thousand epochs of training, the result becomes NaN. Here is fully functioning code to replicate this.
import section
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
import numpy as np
data generation
number_of_instances = 1000
x_data = np.linspace(-5.5,5.5,number_of_instances)
r_data = np.random.randn(number_of_instances)
y_data = 7*np.sin(x_data*0.75)+ x_data + r_data
x_data = x_data.astype("float32")
x_data = x_data.reshape(x_data.size,1)
y_data = y_data.astype("float32")
y_data = y_data.reshape(y_data.size,1)
model building section
hidden_units = 100
k_mixt = 5
l2_reg = 1e-3
learning_rate = 1e-3
hidden_dense = Dense(units=hidden_units,
                     input_dim=y_data,
                     activation=tf.nn.relu,
                     kernel_regularizer=regularizers.l2(l2_reg),
                     name='Dense',
                     trainable=True)
alpha_dense = Dense(units=k_mixt,
                    activation=tf.nn.softmax,
                    name='alpha',
                    trainable=True)
mu_dense = Dense(units=k_mixt,
                 activation=None,
                 name='mu',
                 trainable=True)
sigma_dense = Dense(k_mixt,
                    activation=tf.nn.softplus,
                    name='sigma',
                    trainable=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
training section
for epoch in range(int(5e4)):
    with tf.GradientTape() as tape:
        hidden = hidden_dense(y_data)
        alpha = alpha_dense(hidden)
        mu = mu_dense(hidden)
        sigma = sigma_dense(hidden)
        gm = tfd.MixtureSameFamily(mixture_distribution=tfd.Categorical(probs=alpha),
                                   components_distribution=tfd.Normal(loc=mu, scale=sigma))
        loss = -tf.reduce_sum(gm.log_prob(tf.reshape(x_data, (-1,))))
    grads = tape.gradient(loss, tape.watched_variables())
    (grads_clipped, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    optimizer.apply_gradients(zip(grads_clipped, tape.watched_variables()))
    if epoch % 5e2 == 0:
        print(epoch, loss)
What I found is that the NaN first appears in the sigma_dense and hidden_dense layers at some epoch and then spreads to all other layers. It seems that the cause of the NaN is calculating the gradient of the loss with respect to sigma.
As I learnt from a YouTube video, the gradient of the log-likelihood with respect to sigma is:
d(ln L)/d(sigma) = -n/sigma + 1/sigma^3 * ((x1 - mu)^2 + ... + (xn - mu)^2)
Could this derivative formula be the cause of the NaN? Does anyone have any idea?
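A minimal numerical sketch of the 1/sigma^3 term in that formula (added here, not part of the original post): the gradient with respect to sigma explodes as sigma shrinks, which is one plausible way a very small softplus output for sigma could lead to NaN.
import tensorflow as tf

x = tf.constant([0.0, 1.0, 2.0])
mu = tf.constant(0.0)
for sigma_value in [1.0, 1e-2, 1e-4]:
    sigma = tf.constant(sigma_value)
    with tf.GradientTape() as tape:
        tape.watch(sigma)
        # Normal log-likelihood up to an additive constant
        log_lik = tf.reduce_sum(-tf.math.log(sigma) - 0.5 * tf.square((x - mu) / sigma))
    # the gradient grows roughly like 1/sigma^3 as sigma -> 0
    print(sigma_value, tape.gradient(log_lik, sigma).numpy())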

Loss function with derivative in TensorFlow 2

I am using a TF2 (2.3.0) neural network to approximate the function y which solves the ODE y' + 3y = 0.
I have defined a custom loss class and function in which I am trying to differentiate the single output with respect to the single input so that the equation holds, provided that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf
class CustomLossOde(Loss):
    def __init__(self, x, model, name='ode_loss'):
        super().__init__(name=name)
        self.x = x
        self.model = model

    def call(self, y_true, y_pred):
        with tf.GradientTape() as tape:
            tape.watch(self.x)
            y_p = self.model(self.x)
        dy_dx = tape.gradient(y_p, self.x)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
        return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99),loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
For now I am getting 0 loss from the first epoch, which doesn't make any sense.
I have printed both y_true and y_pred from within the function and they seem OK, so I suspect that the problem is in the gradient, which I didn't succeed to print.
Appreciate any help
Defining a custom loss with the high-level Keras API is a bit difficult in this case. I would instead write the training loop from scratch, as it allows finer-grained control over what you can do.
I took inspiration from these two guides:
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tapes can interact seamlessly. I use one to compute the loss function, and the other to calculate the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
# using the high level tf.data API for data handling
x_train = tf.reshape(x_train,(-1,1))
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1)
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x, y_true) in enumerate(dataset):
    # we need to convert x to a variable if we want the tape to be
    # able to compute the gradient according to x
    x_variable = tf.Variable(x)
    with tf.GradientTape() as model_tape:
        with tf.GradientTape() as loss_tape:
            loss_tape.watch(x_variable)
            y_pred = model(x_variable)
        dy_dx = loss_tape.gradient(y_pred, x_variable)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
    grad = model_tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grad, model.trainable_variables))
    if step % 20 == 0:
        print(f"Step {step}: loss={loss.numpy()}")

How to multiply a layer by a constant vector element wise in Keras?

I want to make a weighted average ensemble of 3 of my trained models. So, I want first to multiply the softmax output of a model (element-wise) by a vector and then average the 3 weighted outputs of the 3 models.
I used the following code to multiply the output of the first model by its weight vector:
from keras.layers import Multiply, Average
resnet_weights = np.asarray([[0.91855, 0.99485, 0.89065, 0.96525, 0.98005,
                              0.93645, 0.6149, 0.934, 0.92505, 0.785, 0.85]], np.float32)
resnet_weight_tensor=tf.constant(resnet_weights, np.float32)
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()
resnet_weighted = Multiply()([finetuned_model.layers[-1].output, resnet_weight_tensor])
print(resnet_weighted)
new_model=Model(model.input, resnet_weighted)
However, I'm stuck with the following error:
What can I do?
Use Lambda instead of Multiply, and K.constant instead of tf.constant (it is backend-neutral):
resnet_weight_tensor=K.constant(resnet_weights, 'float32')
out = finetuned_model.layers[-1].output
resnet_weighted = Lambda(lambda x: x * resnet_weight_tensor)(out)
FULL EXAMPLE:
## BUILD MODELS
batch_size = 32
num_batches = 100
input_shape = (4,)
num_classes = 3
model_1 = make_model(input_shape, 8, num_classes)
model_2 = make_model(input_shape, 10, num_classes)
model_3 = make_model(input_shape, 12, num_classes)
## BUILD ENSEMBLE
models = (model_1, model_2, model_3)
models_ins = [model.input for model in models]
models_outs = [model.output for model in models]
outputs_weights = [np.random.random((batch_size, num_classes)),
                   np.random.random((batch_size, num_classes)),
                   np.random.random((batch_size, num_classes))]
outs_avg = model_outputs_average(models, outputs_weights)
final_out = Dense(num_classes, activation='softmax')(outs_avg)
model_ensemble = Model(inputs=models_ins, outputs=final_out)
model_ensemble.compile('adam', loss='categorical_crossentropy')
### TEST ENSEMBLE
x1 = np.random.randn(batch_size, *input_shape) # toy data
x2 = np.random.randn(batch_size, *input_shape)
x3 = np.random.randn(batch_size, *input_shape)
y = np.random.randint(0,2,(batch_size, num_classes)) # toy labels
model_ensemble.fit([x1,x2,x3], y)
Verify averaging:
[print(layer.name) for layer in model_ensemble.layers] # show layer names
preouts1 = get_layer_outputs(model_ensemble, 'lambda_1', [x1,x2,x3])
preouts2 = get_layer_outputs(model_ensemble, 'lambda_2', [x1,x2,x3])
preouts3 = get_layer_outputs(model_ensemble, 'lambda_3', [x1,x2,x3])
preouts_avg = get_layer_outputs(model_ensemble, 'average_1',[x1,x2,x3])
preouts = np.asarray([preouts1, preouts2, preouts3])
sum_of_diff_of_means = np.sum(np.mean(preouts, axis=0) - preouts_avg)
print(np.sum(np.mean([preouts1, preouts2, preouts3],axis=0) - preouts_avg))
# 4.69e-07
Functions used:
def make_model(input_shape, dense_dim, num_classes=3):
    ipt = Input(shape=input_shape)
    x = Dense(dense_dim, activation='relu')(ipt)
    out = Dense(num_classes, activation='softmax')(x)
    model = Model(ipt, out)
    model.compile('adam', loss='categorical_crossentropy')
    return model

def model_outputs_average(models, outputs_weights):
    outs = [model.output for model in models]
    out_shape = K.int_shape(outs[0])[1:]  # ignore batch dim
    assert all([(K.int_shape(out)[1:] == out_shape) for out in outs]), \
        "All model output shapes must match"

    outs_weights = [K.constant(w, 'float32') for w in outputs_weights]
    ow_shape = K.int_shape(outs_weights[0])
    assert all([(K.int_shape(w) == ow_shape) for w in outs_weights]), \
        "All outputs_weights and model.output shapes must match"

    weights_layers = [Lambda(lambda x: x * ow)(out) for ow, out
                      in zip(outs_weights, outs)]
    return Average()(weights_layers)

def get_layer_outputs(model, layer_name, input_data, train_mode=False):
    outputs = [layer.output for layer in model.layers if layer_name in layer.name]
    layers_fn = K.function([model.input, K.learning_phase()], outputs)
    return [layers_fn([input_data, int(train_mode)])][0][0]
The bug is possibly caused by mixing the Keras API and the TensorFlow API, since your resnet_weight_tensor is a tensor from the TensorFlow API, while finetuned_model.layers[-1].output is the output of a Keras layer. Some discussion can be found in issue 7362.
One workaround is to wrap resnet_weight_tensor in a Keras Input layer.
from keras.layers import Multiply, Average, Input
resnet_weights = np.asarray([[0.91855, 0.99485, 0.89065, 0.96525, 0.98005,
                              0.93645, 0.6149, 0.934, 0.92505, 0.785, 0.85]], np.float32)
resnet_weight_tensor=tf.constant(resnet_weights, np.float32)
resnet_weight_input = Input(tensor=resnet_weight_tensor)
sess = tf.InteractiveSession()
print(resnet_weight_tensor.eval())
sess.close()
resnet_weighted = Multiply()([finetuned_model.layers[-1].output, resnet_weight_input])
print(resnet_weighted)
new_model=Model([model.input, resnet_weight_input], resnet_weighted)

Simple custom layer in keras, tensorflow confusion

First before I explain, here's the relevant code snippet:
input = Input(shape=(784, ))
hidden1 = Dense(784, activation='relu')(input)
hidden2 = Dense(784, activation='relu')(hidden1)
hidden3 = Dense(1568, activation='relu')(hidden2)
hidden4 = Lambda(lambda x: makeComplex(x))(hidden3)
hidden5 = Reshape((1, 28, 28))(hidden4)
hidden6 = Lambda(lambda x: ifft2(x))(hidden5)
hidden7 = Flatten()(hidden6)
output = Dense(train_targets.shape[1], activation='linear')(hidden7)
model = Model(inputs=input, outputs=output)
print(model.summary())
where ifft2(x) is
def ifft2(x):
    import tensorflow as tf
    return tf.cast(tf.spectral.ifft2d(tf.cast(x, dtype=tf.complex64)), tf.float32)
My goal now is to implement the makeComplex method.
Basically, it gets a vector of size 1568, and I want it to return a vector of size 784 in the following very simple fashion:
new[k] = old[k] + old[k + 1] * i where i is the imaginary unit
Here's my attempt:
def makeComplex(x):
    y = np.zeros((1, 784))
    for i in range(784):
        y[i] = np.complex(x[i], x[i + 1])
    return y
Of course this doesn't work, because x is not actually a vector but rather a TensorFlow tensor, something I know nothing about. How can I make this work?
For an example tensor [1., 2., 3., 4.], what you want is [1.+2.j, 3.+4.j]. I think you can use tf.gather to get the two tensors [1., 3.] and [2., 4.], then use tf.complex to get the answer.
import tensorflow as tf
import numpy as np
a = tf.constant([1.,2.,3.,4.])
real = tf.gather(a,np.arange(0,a.get_shape().as_list()[0],2))
imag = tf.gather(a,np.arange(1,a.get_shape().as_list()[0],2))
res = tf.complex(real, imag)
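A hypothetical adaptation of the same idea to the batched 1568 -> 784 case from the question (this sketch is not part of the original answer; strided slicing on the last axis plays the role of tf.gather, and the name make_complex is assumed):
import tensorflow as tf

def make_complex(x):
    # x has shape (batch, 1568): even columns become the real part,
    # odd columns the imaginary part, giving shape (batch, 784)
    real = x[:, 0::2]
    imag = x[:, 1::2]
    return tf.complex(real, imag)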

SGD converges but batch learning does not, simple regression in tensorflow

I have run into an issue where batch learning in TensorFlow fails to converge to the correct solution for a simple convex optimization problem, whereas SGD converges. A small example is given below in the Julia and Python programming languages; I have verified that the exact same behaviour results from using TensorFlow from both Julia and Python.
I'm trying to fit the linear model y = s*W + B with parameters W and B.
The cost function is quadratic, so the problem is convex and should be easily solved using a small enough step size. If I feed all the data at once, the end result is just a prediction of the mean of y. If, however, I feed one datapoint at a time (commented code in the Julia version), the optimization converges to the correct parameters very quickly.
I have also verified that the gradients computed by TensorFlow differ between the batch example and summing up the gradients for each datapoint individually.
Any ideas on where I have failed?
using TensorFlow
s = linspace(1,10,10)
s = [s reverse(s)]
y = s*[1,4] + 2
session = Session(Graph())
s_ = placeholder(Float32, shape=[-1,2])
y_ = placeholder(Float32, shape=[-1,1])
W = Variable(0.01randn(Float32, 2,1), name="weights1")
B = Variable(Float32(1), name="bias3")
q = s_*W + B
loss = reduce_mean((y_ - q).^2)
train_step = train.minimize(train.AdamOptimizer(0.01), loss)
function train_critic(s,targets)
    for i = 1:1000
        # for i = 1:length(y)
        #     run(session, train_step, Dict(s_ => s[i,:]', y_ => targets[i]))
        # end
        ts = run(session, [loss,train_step], Dict(s_ => s, y_ => targets))[1]
        println(ts)
    end
    v = run(session, q, Dict(s_ => s, y_ => targets))
    plot(s[:,1],v, lab="v (Predicted value)")
    plot!(s[:,1],y, lab="y (Correct value)")
    gui();
end
run(session, initialize_all_variables())
train_critic(s,y)
Same code in python (I'm not a python user so this might be ugly)
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import tensorflow as tf
from tensorflow.python.framework.ops import reset_default_graph
s = np.linspace(1,10,50).reshape((50,1))
s = np.concatenate((s,s[::-1]),axis=1).astype('float32')
y = np.add(np.matmul(s,[1,4]), 2).astype('float32')
reset_default_graph()
rng = np.random
s_ = tf.placeholder(tf.float32, [None, 2])
y_ = tf.placeholder(tf.float32, [None])
weight_initializer = tf.truncated_normal_initializer(stddev=0.1)
with tf.variable_scope('model'):
    W = tf.get_variable('W', [2, 1],
                        initializer=weight_initializer)
    B = tf.get_variable('B', [1],
                        initializer=tf.constant_initializer(0.0))
    q = tf.matmul(s_, W) + B
loss = tf.reduce_mean(tf.square(tf.sub(y_, q)))
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss)
num_epochs = 200
train_cost = []
with tf.Session() as sess:
    init = tf.initialize_all_variables()
    sess.run(init)
    for e in range(num_epochs):
        feed_dict_train = {s_: s, y_: y}
        fetches_train = [train_op, loss]
        res = sess.run(fetches=fetches_train, feed_dict=feed_dict_train)
        train_cost = [res[1]]
        print(train_cost)
The answer turned out to be that when I fed in the targets, I fed a vector and not an Nx1 matrix. The operation y_ - q then turned into a broadcast operation and, instead of returning the elementwise difference, returned an NxN matrix with the desired difference along the diagonal. In Julia, I solved this by modifying the line
train_critic(s,y)
to
train_critic(s,reshape(y, length(y),1))
to ensure y being a matrix.
A subtle error that took me a very long time to find! Part of the confusion was that TensorFlow seems to treat vectors as row vectors and not as column vectors like Julia, hence the broadcast operation in y_ - q.
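A small illustration of that broadcast (a sketch added here, not from the original answer), using current TensorFlow:
import tensorflow as tf

y_vec = tf.constant([1.0, 2.0, 3.0])        # targets fed as a vector, shape (3,)
q_col = tf.constant([[1.0], [2.0], [3.0]])  # predictions of shape (3, 1)
print((y_vec - q_col).shape)                        # (3, 3): silent broadcast to a matrix
print((tf.reshape(y_vec, (-1, 1)) - q_col).shape)   # (3, 1): the intended elementwise difference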