Loss function with derivative in TensorFlow 2 - tensorflow

I am using TF2 (2.3.0) NN to approximate the function y which solves the ODE: y'+3y=0
I have defined cutsom loss class and function in which I am trying to differentiate the single output with respect to the single input so the equation holds, provided that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf
class CustomLossOde(Loss):
def __init__(self, x, model, name='ode_loss'):
super().__init__(name=name)
self.x = x
self.model = model
def call(self, y_true, y_pred):
with tf.GradientTape() as tape:
tape.watch(self.x)
y_p = self.model(self.x)
dy_dx = tape.gradient(y_p, self.x)
loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99),loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
for now I am getting 0 loss from the fisrt epoch, which doesn't make any sense.
I have printed both y_true and y_test from within the function and they seem OK so I suspect that the problem is in the gradien which I didn't succeed to print.
Apprecitate any help

Defining a custom loss with the high level Keras API is a bit difficult in that case. I would instead write the training loop from scracth, as it allows a finer grained control over what you can do.
I took inspiration from those two guides :
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tape can interact seamlessly. I use one to compute the loss function, the other to calculate the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
# using the high level tf.data API for data handling
x_train = tf.reshape(x_train,(-1,1))
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1)
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x,y_true) in enumerate(dataset):
# we need to convert x to a variable if we want the tape to be
# able to compute the gradient according to x
x_variable = tf.Variable(x)
with tf.GradientTape() as model_tape:
with tf.GradientTape() as loss_tape:
loss_tape.watch(x_variable)
y_pred = model(x_variable)
dy_dx = loss_tape.gradient(y_pred, x_variable)
loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
grad = model_tape.gradient(loss, model.trainable_variables)
opt.apply_gradients(zip(grad, model.trainable_variables))
if step%20==0:
print(f"Step {step}: loss={loss.numpy()}")

Related

Training `tfd.MixtureSameFamily` gets 'NaN'

I want to train a mixture density model using tfd.MixtureSameFamily. But after several thousand epochs of training, the result gets NaN. Here's full functioning code to replicate this.
import section
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
import numpy as np
data generation
number_of_instances = 1000
x_data = np.linspace(-5.5,5.5,number_of_instances)
r_data = np.random.randn(number_of_instances)
y_data = 7*np.sin(x_data*0.75)+ x_data + r_data
x_data = x_data.astype("float32")
x_data = x_data.reshape(x_data.size,1)
y_data = y_data.astype("float32")
y_data = y_data.reshape(y_data.size,1)
model building section
hidden_units = 100
k_mixt = 5
l2_reg = 1e-3
learning_rate = 1e-3
hidden_dense = Dense(units=hidden_units,
input_dim=y_data,
activation=tf.nn.relu,
kernel_regularizer=regularizers.l2(l2_reg),
name=f'Dense',
trainable=True)
alpha_dense = Dense(units=k_mixt,
activation=tf.nn.softmax,
name='alpha',
trainable=True)
mu_dense = Dense(units=k_mixt,
activation=None,
name='mu',
trainable=True)
sigma_dense = Dense(k_mixt,
activation=tf.nn.softplus,
name='sigma',
trainable=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
training section
for epoch in range(int(5e4)):
with tf.GradientTape() as tape:
hidden = hidden_dense(y_data)
alpha = alpha_dense(hidden)
mu = mu_dense((hidden))
sigma=sigma_dense(hidden)
gm = tfd.MixtureSameFamily(mixture_distribution=tfd.Categorical(probs=alpha),components_distribution=tfd.Normal(loc=mu, scale=sigma))
loss = -tf.reduce_sum(gm.log_prob(tf.reshape(x_data,(-1,))))
grads = tape.gradient(loss,tape.watched_variables())
(grads_clipped, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
optimizer.apply_gradients(zip(grads_clipped, tape.watched_variables()))
if epoch%5e2 == 0:
print(epoch, loss)
What I found is that the nan first appears sigma_dense layer and hidden_dense layer in some epoch and then spread to all other layers. It seems that the cause of Nan is calculating gradient of loss respect to sigma.
As I learnt from a youtube video, the gradient of loss respect to sigma is:
d(ln(loss)/d(sigma) = -n/sigma + 1/sigma^3 * ((x1 - mu)^2 + ... + (xn - mu)^2)
Could this derivative formula be the cause of nan? Does anyone have any idea?

ValueError: Input 0 of layer “Discriminator” is incompatible with the layer: expected shape=(None, 3), found shape=(100, 2)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score,\
accuracy_score, balanced_accuracy_score,classification_report,\
plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, Concatenate
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal
import tensorflow.keras.backend as K
from sklearn.utils import shuffle
import pickle
from tqdm import tqdm
import numpy as np
from scipy import stats
import pandas as pd
np.random.seed(1635848)
def get_data_XYZ_one_dimensional(n, a=-2, c=1/2, random_state=None, verbose=True):
"""
Generates pseudo-random data distributed according to the distribution defined in section 2.1 of the document
"Math/Confounders and data generation.pdf".
:param n: Number of data points to generate.
:param a: Mean of X.
:param c: Shape parameter for Weibull distribution.
:param random_state: Used to set the seed of numpy.random before generation of random numbers.
:param verbose: If True will display a progress bar. If False it will not display a progress bar.
:return: Pandas DataFrame with three columns (corresponding to X, Y and Z) and n rows (corresponding to the n
generated pseudo-random samples).
"""
np.random.seed(random_state)
output = []
iterator = tqdm(range(n)) if verbose else range(n)
for _ in iterator:
X = stats.norm.rvs(loc=-2, scale=1)
Y = stats.bernoulli.rvs(p=1/(1+np.exp(-X)))
if Y == 0:
Z = stats.expon.rvs(scale=np.exp(-X)) # note: np.exp(-X) could be cached for more computational efficiency but would render the code less useful
elif Y == 1:
Z = stats.weibull_min.rvs(c=c, scale=np.exp(-X))
else:
assert False
output.append((X, Y, Z))
return pd.DataFrame(output, columns=["Personal information", "Treatment", "Time to event"])
data = get_data_XYZ_one_dimensional(n=100, random_state=0)
print(data)
# The Architecture of CGAN
class cGAN():
"""
Class containing 3 methods (and __init__): generator, discriminator and train.
Generator is trained using random noise and label as inputs. Discriminator is trained
using real/fake samples and labels as inputs.
"""
def __init__(self,latent_dim=100, out_shape=3):
self.latent_dim = latent_dim
self.out_shape = out_shape
self.num_classes = 2
# using Adam as our optimizer
optimizer = Adam(0.0002, 0.5)
# building the discriminator
self.discriminator = self.discriminator()
self.discriminator.compile(loss=['binary_crossentropy'],
optimizer=optimizer,
metrics=['accuracy'])
# building the generator
self.generator = self.generator()
noise = Input(shape=(self.latent_dim,))
label = Input(shape=(1,))
gen_samples = self.generator([noise, label])
# we don't train discriminator when training generator
self.discriminator.trainable = False
valid = self.discriminator([gen_samples, label])
# combining both models
self.combined = Model([noise, label], valid)
self.combined.compile(loss=['binary_crossentropy'],
optimizer=optimizer,
metrics=['accuracy'])
def generator(self):
init = RandomNormal(mean=0.0, stddev=0.02)
model = Sequential()
model.add(Dense(128, input_dim=self.latent_dim))
model.add(Dropout(0.2))
model.add(LeakyReLU(alpha=0.2))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(256))
model.add(Dropout(0.2))
model.add(LeakyReLU(alpha=0.2))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(512))
model.add(Dropout(0.2))
model.add(LeakyReLU(alpha=0.2))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(self.out_shape, activation='tanh'))
noise = Input(shape=(self.latent_dim,))
label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))
model_input = multiply([noise, label_embedding])
gen_sample = model(model_input)
model.summary()
return Model([noise, label], gen_sample, name="Generator")
def discriminator(self):
init = RandomNormal(mean=0.0, stddev=0.02)
model = Sequential()
model.add(Dense(512, input_dim=self.out_shape, kernel_initializer=init))
model.add(LeakyReLU(alpha=0.2))
model.add(Dense(256, kernel_initializer=init))
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.4))
model.add(Dense(128, kernel_initializer=init))
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
gen_sample = Input(shape=(self.out_shape,))
label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(self.num_classes, self.out_shape)(label))
model_input = multiply([gen_sample, label_embedding])
validity = model(model_input)
model.summary()
return Model(inputs=[gen_sample, label], outputs=validity, name="Discriminator")
def train(self, X_train, y_train, pos_index, neg_index, epochs, sampling=False, batch_size=32, sample_interval=100, plot=True):
# though not recommended, defining losses as global helps as in analysing our cgan out of the class
global G_losses
global D_losses
G_losses = []
D_losses = []
# Adversarial ground truths
valid = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))
for epoch in range(epochs):
# if sampling==True --> train discriminator with 8 sample from positive class and rest with negative class
if sampling:
idx1 = np.random.choice(pos_index, 3)
idx0 = np.random.choice(neg_index, batch_size-3)
idx = np.concatenate((idx1, idx0))
# if sampling!=True --> train discriminator using random instances in batches of 32
else:
idx = np.random.choice(len(y_train), batch_size)
samples, labels = X_train[idx], y_train[idx]
samples, labels = shuffle(samples, labels)
# Sample noise as generator input
noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
gen_samples = self.generator.predict([noise, labels])
# label smoothing
if epoch < epochs//1.5:
valid_smooth = (valid+0.1)-(np.random.random(valid.shape)*0.1)
fake_smooth = (fake-0.1)+(np.random.random(fake.shape)*0.1)
else:
valid_smooth = valid
fake_smooth = fake
# Train the discriminator
self.discriminator.trainable = True
d_loss_real = self.discriminator.train_on_batch([samples, labels], valid_smooth)
d_loss_fake = self.discriminator.train_on_batch([gen_samples, labels], fake_smooth)
d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
# Train Generator
self.discriminator.trainable = False
sampled_labels = np.random.randint(0, 2, batch_size).reshape(-1, 1)
# Train the generator
g_loss = self.combined.train_on_batch([noise, sampled_labels], valid)
if (epoch+1)%sample_interval==0:
print('[%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f'
% (epoch, epochs, d_loss[0], g_loss[0]))
G_losses.append(g_loss[0])
D_losses.append(d_loss[0])
if plot:
if epoch+1==epochs:
plt.figure(figsize=(10,5))
plt.title("Generator and Discriminator Loss")
plt.plot(G_losses,label="G")
plt.plot(D_losses,label="D")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()
data.Treatment.value_counts()
scaler = StandardScaler()
X = scaler.fit_transform(data.drop('Treatment', 1))
y = data['Treatment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lgb_1 = lgb.LGBMClassifier()
lgb_1.fit(X_train, y_train)
y_pred = lgb_1.predict(X_test)
# evaluation
print(classification_report(y_test, y_pred))
plot_confusion_matrix(lgb_1, X_test, y_test)
plt.show()
le = preprocessing.LabelEncoder()
for i in ['Personal information', 'Treatment', 'Time to event']:
data[i] = le.fit_transform(data[i].astype(str))
y_train = y_train.reshape(-1,1)
pos_index = np.where(y_train==1)[0]
neg_index = np.where(y_train==0)[0]
cgan.train(X_train, y_train, pos_index, neg_index, epochs=500)
Here, the training gives an error ValueError: Input 0 of layer "Discriminator" is incompatible with the layer: expected shape=(None, 3), found shape=(100, 2). Well I understand I have to fix the shape by changing the input but where and how to do it.
Also there are 3 columns in data so how to go about making this work?
I think the fix out_shape=2 and not 3 because the generated output has 2 and you stated the number of classes to be 2 as well. Unless there is something else I am missing.
def __init__(self, latent_dim=100, out_shape=2):

400% higher error with PyTorch compared with identical Keras model (with Adam optimizer)

TLDR:
A simple (single hidden-layer) feed-forward Pytorch model trained to predict the function y = sin(X1) + sin(X2) + ... sin(X10) substantially underperforms an identical model built/trained with Keras. Why is this so and what can be done to mitigate the difference in performance?
In training a regression model, I noticed that PyTorch drastically underperforms an identical model built with Keras.
This phenomenon has been observed and reported previously:
The same model produces worse results on pytorch than on tensorflow
CNN model in pytorch giving 30% less accuracy to Tensoflowflow model:
PyTorch Adam vs Tensorflow Adam
Suboptimal convergence when compared with TensorFlow model
RNN and Adam: slower convergence than Keras
PyTorch comparable but worse than keras on a simple feed forward network
Why is the PyTorch model doing worse than the same model in Keras even with the same weight initialization?
Why Keras behave better than Pytorch under the same network configuration?
The following explanations and suggestions have been made previously as well:
Using the same decimal precision (32 vs 64): 1, 2,
Using a CPU instead of a GPU: 1,2
Change retain_graph=True to create_graph=True in computing the 2nd derivative with autograd.grad: 1
Check if keras is using a regularizer, constraint, bias, or loss function in a different way from pytorch: 1,2
Ensure you are computing the validation loss in the same way: 1
Use the same initialization routine: 1,2
Training the pytorch model for longer epochs: 1
Trying several random seeds: 1
Ensure that model.eval() is called in validation step when training pytorch model: 1
The main issue is with the Adam optimizer, not the initialization: 1
To understand this issue, I trained a simple two-layer neural network (much simpler than my original model) in Keras and PyTorch, using the same hyperparameters and initialization routines, and following all the recommendations listed above. However, the PyTorch model results in a mean squared error (MSE) that is 400% higher than the MSE of the Keras model.
Here is my code:
0. Imports
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
1. Generate a reproducible dataset
def get_data():
np.random.seed(0)
Xtrain = np.random.normal(0, 1, size=(7000,10))
Xval = np.random.normal(0, 1, size=(700,10))
ytrain = np.sum(np.sin(Xtrain), axis=-1)
yval = np.sum(np.sin(Xval), axis=-1)
scaler = MinMaxScaler()
ytrain = scaler.fit_transform(ytrain.reshape(-1,1)).reshape(-1)
yval = scaler.transform(yval.reshape(-1,1)).reshape(-1)
return Xtrain, Xval, ytrain, yval
class XYData(Dataset):
def __init__(self, X, y):
super(XYData, self).__init__()
self.X = torch.tensor(X, dtype=torch.float32)
self.y = torch.tensor(y, dtype=torch.float32)
self.len = len(y)
def __getitem__(self, index):
return (self.X[index], self.y[index])
def __len__(self):
return self.len
# Data, dataset, and dataloader
Xtrain, Xval, ytrain, yval = get_data()
traindata = XYData(Xtrain, ytrain)
valdata = XYData(Xval, yval)
trainloader = DataLoader(dataset=traindata, shuffle=True, batch_size=32, drop_last=False)
valloader = DataLoader(dataset=valdata, shuffle=True, batch_size=32, drop_last=False)
2. Build Keras and PyTorch models with identical hyperparameters and initialization methods
class TorchLinearModel(nn.Module):
def __init__(self, input_dim=10, random_seed=0):
super(TorchLinearModel, self).__init__()
_ = torch.manual_seed(random_seed)
self.hidden_layer = nn.Linear(input_dim,100)
self.initialize_layer(self.hidden_layer)
self.output_layer = nn.Linear(100, 1)
self.initialize_layer(self.output_layer)
def initialize_layer(self, layer):
_ = torch.nn.init.xavier_normal_(layer.weight)
#_ = torch.nn.init.xavier_uniform_(layer.weight)
_ = torch.nn.init.constant(layer.bias,0)
def forward(self, x):
x = self.hidden_layer(x)
x = self.output_layer(x)
return x
def mean_squared_error(ytrue, ypred):
return torch.mean(((ytrue - ypred) ** 2))
def build_torch_model():
torch_model = TorchLinearModel()
optimizer = optim.Adam(torch_model.parameters(),
betas=(0.9,0.9999),
eps=1e-7,
lr=1e-3,
weight_decay=0)
return torch_model, optimizer
def build_keras_model():
x = layers.Input(shape=10)
z = layers.Dense(units=100, activation=None, use_bias=True, kernel_regularizer=None,
bias_regularizer=None)(x)
y = layers.Dense(units=1, activation=None, use_bias=True, kernel_regularizer=None,
bias_regularizer=None)(z)
keras_model = Model(x, y, name='linear')
optimizer = Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.9999, epsilon=1e-7,
amsgrad=False)
keras_model.compile(optimizer=optimizer, loss='mean_squared_error')
return keras_model
# Instantiate models
torch_model, optimizer = build_torch_model()
keras_model = build_keras_model()
3. Train PyTorch model for 100 epochs:
torch_trainlosses, torch_vallosses = [], []
for epoch in range(100):
# Training
losses = []
_ = torch_model.train()
for i, (x,y) in enumerate(trainloader):
optimizer.zero_grad()
ypred = torch_model(x)
loss = mean_squared_error(y, ypred)
_ = loss.backward()
_ = optimizer.step()
losses.append(loss.item())
torch_trainlosses.append(np.mean(losses))
# Validation
losses = []
_ = torch_model.eval()
with torch.no_grad():
for i, (x, y) in enumerate(valloader):
ypred = torch_model(x)
loss = mean_squared_error(y, ypred)
losses.append(loss.item())
torch_vallosses.append(np.mean(losses))
print(f"epoch={epoch+1}, train_loss={torch_trainlosses[-1]:.4f}, val_loss={torch_vallosses[-1]:.4f}")
4. Train Keras model for 100 epochs:
history = keras_model.fit(Xtrain, ytrain, sample_weight=None, batch_size=32, epochs=100,
validation_data=(Xval, yval))
5. Loss in training history
plt.plot(torch_trainlosses, color='blue', label='PyTorch Train')
plt.plot(torch_vallosses, color='blue', linestyle='--', label='PyTorch Val')
plt.plot(history.history['loss'], color='brown', label='Keras Train')
plt.plot(history.history['val_loss'], color='brown', linestyle='--', label='Keras Val')
plt.legend()
Keras records a much lower error in the training. Since this may be due to a difference in how Keras computes the loss, I calculated the prediction error on the validation set with sklearn.metrics.mean_squared_error
6. Validation error after training
ypred_keras = keras_model.predict(Xval).reshape(-1)
ypred_torch = torch_model(torch.tensor(Xval, dtype=torch.float32))
ypred_torch = ypred_torch.detach().numpy().reshape(-1)
mse_keras = metrics.mean_squared_error(yval, ypred_keras)
mse_torch = metrics.mean_squared_error(yval, ypred_torch)
print('Percent error difference:', (mse_torch / mse_keras - 1) * 100)
r_keras = pearsonr(yval, ypred_keras)[0]
r_pytorch = pearsonr(yval, ypred_torch)[0]
print("r_keras:", r_keras)
print("r_pytorch:", r_pytorch)
plt.scatter(ypred_keras, yval); plt.title('Keras'); plt.show(); plt.close()
plt.scatter(ypred_torch, yval); plt.title('Pytorch'); plt.show(); plt.close()
Percent error difference: 479.1312469426776
r_keras: 0.9115184443702814
r_pytorch: 0.21728812737220082
The correlation of predicted values with ground truth is 0.912 for Keras but 0.217 for Pytorch, and the error for Pytorch is 479% higher!
7. Other trials
I also tried:
Lowering the learning rate for Pytorch (lr=1e-4), R increases from 0.217 to 0.576, but it's still much worse than Keras (r=0.912).
Increasing the learning rate for Pytorch (lr=1e-2), R is worse at 0.095
Training numerous times with different random seeds. The performance is roughly the same, regardless.
Trained for longer than 100 epochs. No improvement was observed!
Used torch.nn.init.xavier_uniform_ instead of torch.nn.init.xavier_normal_ in the initialization of the weights. R improves from 0.217 to 0.639, but it's still worse than Keras (0.912).
What can be done to ensure that the PyTorch model converges to a reasonable error comparable with the Keras model?
The problem here is unintentional broadcasting in the PyTorch training loop.
The result of a nn.Linear operation always has shape [B,D], where B is the batch size and D is the output dimension. Therefore, in your mean_squared_error function ypred has shape [32,1] and ytrue has shape [32]. By the broadcasting rules used by NumPy and PyTorch this means that ytrue - ypred has shape [32,32]. What you almost certainly meant is for ypred to have shape [32]. This can be accomplished in many ways; probably the most readable is to use Tensor.flatten
class TorchLinearModel(nn.Module):
...
def forward(self, x):
x = self.hidden_layer(x)
x = self.output_layer(x)
return x.flatten()
which produces the following train/val curves

How can I compile batched training of a gpflow GPR into a tf.function?

I need to train a GPR model in multiple batches per epoch using a custom loss function. I would like to do this using GPflow and I would like to compile my training using tf.function to increase the efficiency. However, gpflow.GPR must be re-instantiated each time you supply new data, so tf.function will have to re-trace each time. This makes the code slower rather than faster.
This is the initial setup:
import numpy as np
from itertools import islice
import tensorflow as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
from sklearn.model_selection import train_test_split
import gpflow
from gpflow.kernels import SquaredExponential
import time
data_size = 1000
train_fract = 0.8
batch_size = 250
n_epochs = 3
iterations_per_epoch = int(train_fract * data_size/batch_size)
tf.random.set_seed(3)
# Generate dummy data
x = np.arange(data_size)
y = np.arange(data_size) + np.random.rand(data_size)
# Slice into train and validate sets
x_train, x_validate, y_train, y_validate = train_test_split(x, y, random_state = 1, test_size = 1-train_fract )
# Convert data into tensorflow constants
x_train = tf.constant(x_train[:, np.newaxis], dtype=np.float64)
x_validate = tf.constant(x_validate[:, np.newaxis], dtype=np.float64)
y_train = tf.constant(y_train[:, np.newaxis], dtype=np.float64)
y_validate = tf.constant(y_validate[:, np.newaxis], dtype=np.float64)
# Batch data
batched_dataset = (
tf.data.Dataset.from_tensor_slices((x_train, y_train))
.shuffle(buffer_size=len(x_train), seed=1)
.repeat(count=None)
.batch(batch_size)
)
# Create kernel
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())
amplitude = tfp.util.TransformedVariable(initial_value=1, bijector=constrain_positive, dtype=np.float64, name="amplitude")
len_scale = tfp.util.TransformedVariable(initial_value=10, bijector=constrain_positive, dtype=np.float64, name="len_scale")
kernel = SquaredExponential(variance=amplitude, lengthscales=len_scale, name="squared_exponential_kernel")
obs_noise = tfp.util.TransformedVariable(initial_value=1e-3, bijector=constrain_positive, dtype=np.float64, name="observation_noise")
# Define custom loss function
#tf.function(autograph=False, experimental_compile=False)
def my_custom_loss(y_predict, y_true):
return tf.math.reduce_mean(tf.math.squared_difference(y_predict, y_true))
#optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
This is how I train without a tf.function:
gpr_model_j_i = gpflow.models.GPR(data=(x_train, y_train), kernel=kernel, noise_variance=obs_noise)
# Start training loop
for j in range(n_epochs):
for i, (x_train_j_i, y_train_j_i) in enumerate(islice(batched_dataset, iterations_per_epoch)):
with tf.GradientTape() as tape:
gpr_model_j_i = gpflow.models.GPR(data=(x_train_j_i, y_train_j_i), kernel=kernel, noise_variance=gpr_model_j_i.likelihood.variance)
y_predict_j_i = gpr_model_j_i.predict_f(x_validate)[0]
loss_j_i = my_custom_loss(y_predict_j_i, y_validate)
grads_j_i = tape.gradient(loss_j_i, gpr_model_j_i.trainable_variables)
optimizer.apply_gradients(zip(grads_j_i, gpr_model_j_i.trainable_variables))
This is how I train with a tf.function:
#tf.function(autograph=False, experimental_compile=False)
def tf_function_attempt_3(model): #, optimizer):
with tf.GradientTape() as tape:
y_predict_j_i = model.predict_f(x_validate)[0]
loss_j_i = my_custom_loss(y_predict_j_i, y_validate)
grads_j_i = tape.gradient(loss_j_i, model.trainable_variables)
optimizer.apply_gradients(zip(grads_j_i, model.trainable_variables))
print("TRACING...", end="")
for j in range(n_epochs):
for i, (x_train_j_i, y_train_j_i) in enumerate(islice(batched_dataset, iterations_per_epoch)):
gpr_model_j_i = gpflow.models.GPR(data=(x_train_j_i, y_train_j_i), kernel=kernel, noise_variance=gpr_model_j_i.likelihood.variance)
tf_function_attempt_3(gpr_model_j_i)#, optimizer)
The tf.function retraces for each batch and is significantly slower than the normal training.
Is there a way to speed up the batched training of my GPR model with tf.function while using a custom loss function and GPflow? If not, I am open to suggestions for an alternative approach.
You don't have to re-instantiate GPR each time. You can construct tf.Variable holders with unconstrained shape and then .assign to them:
import gpflow
import numpy as np
import tensorflow as tf
input_dim = 1
initial_x, initial_y = np.zeros((0, input_dim)), np.zeros((0, 1)) # or your first batch
x_var = tf.Variable(initial_x, shape=(None, input_dim), dtype=tf.float64)
y_var = tf.Variable(initial_y, shape=(None,1), dtype=tf.float64)
# in principle you could also set shape=(None, None)...
m = gpflow.models.GPR((x_var, y_var), gpflow.kernels.SquaredExponential())
loss = m.training_loss_closure() # compile=True default wraps in tf.function()
N1 = 3
x1, y1 = np.random.randn(N1, input_dim), np.random.randn(N1, 1)
m.data[0].assign(x1)
m.data[1].assign(y1)
loss() # traces the first time
N2 = 7
x2, y2 = np.random.randn(N2, input_dim), np.random.randn(N2, 1)
m.data[0].assign(x2)
m.data[1].assign(y2)
loss() # does not trace again

Keras Loss Function with Additional Dynamic Parameter

I'm working on implementing prioritized experience replay for a deep-q network, and part of the specification is to multiply gradients by what's know as importance sampling (IS) weights. The gradient modification is discussed in section 3.4 of the following paper: https://arxiv.org/pdf/1511.05952.pdf I'm struggling with creating a custom loss function that takes in an array of IS weights in addition to y_true and y_pred.
Here's a simplified version of my model:
import numpy as np
import tensorflow as tf
# Input is RAM, each byte in the range of [0, 255].
in_obs = tf.keras.layers.Input(shape=(4,))
# Normalize the observation to the range of [0, 1].
norm = tf.keras.layers.Lambda(lambda x: x / 255.0)(in_obs)
# Hidden layers.
dense1 = tf.keras.layers.Dense(128, activation="relu")(norm)
dense2 = tf.keras.layers.Dense(128, activation="relu")(dense1)
dense3 = tf.keras.layers.Dense(128, activation="relu")(dense2)
dense4 = tf.keras.layers.Dense(128, activation="relu")(dense3)
# Output prediction, which is an action to take.
out_pred = tf.keras.layers.Dense(2, activation="linear")(dense4)
opt = tf.keras.optimizers.Adam(lr=5e-5)
network = tf.keras.models.Model(inputs=in_obs, outputs=out_pred)
network.compile(optimizer=opt, loss=huber_loss_mean_weighted)
Here's my custom loss function, which is just an implementation of Huber Loss multiplied by the IS weights:
'''
' Huber loss: https://en.wikipedia.org/wiki/Huber_loss
'''
def huber_loss(y_true, y_pred):
error = y_true - y_pred
cond = tf.keras.backend.abs(error) < 1.0
squared_loss = 0.5 * tf.keras.backend.square(error)
linear_loss = tf.keras.backend.abs(error) - 0.5
return tf.where(cond, squared_loss, linear_loss)
'''
' Importance Sampling weighted huber loss.
'''
def huber_loss_mean_weighted(y_true, y_pred, is_weights):
error = huber_loss(y_true, y_pred)
return tf.keras.backend.mean(error * is_weights)
The important bit is that is_weights is dynamic, i.e. it's different each time fit() is called. As such, I cannot simply close over is_weights as described here: Make a custom loss function in keras
I found this code online, which appears to use a Lambda layer to compute the loss: https://github.com/keras-team/keras/blob/master/examples/image_ocr.py#L475 It looks promising, but I'm struggling to understand it/adapt it to my particular problem. Any help is appreciated.
OK. Here is an example.
from keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
from keras.models import Model
from keras.losses import categorical_crossentropy
def sample_loss( y_true, y_pred, is_weight ) :
return is_weight * categorical_crossentropy( y_true, y_pred )
x = Input(shape=(32,32,3), name='image_in')
y_true = Input( shape=(10,), name='y_true' )
is_weight = Input(shape=(1,), name='is_weight')
f = Conv2D(16,(3,3),padding='same')(x)
f = MaxPool2D((2,2),padding='same')(f)
f = Conv2D(32,(3,3),padding='same')(f)
f = MaxPool2D((2,2),padding='same')(f)
f = Conv2D(64,(3,3),padding='same')(f)
f = MaxPool2D((2,2),padding='same')(f)
f = Flatten()(f)
y_pred = Dense(10, activation='softmax', name='y_pred' )(f)
model = Model( inputs=[x, y_true, is_weight], outputs=y_pred, name='train_only' )
model.add_loss( sample_loss( y_true, y_pred, is_weight ) )
model.compile( loss=None, optimizer='sgd' )
print model.summary()
Note, since you've add loss through add_loss(), you don't have to do it through compile( loss=xxx ).
With regards to train a model, nothing is special except you move y_true to your input end. See below
import numpy as np
a = np.random.randn(8,32,32,3)
a_true = np.random.randn(8,10)
a_is_weight = np.random.randint(0,2,size=(8,1))
model.fit( [a, a_true, a_is_weight] )
Finally, you can make a testing model (which share all weights in model) for easier use, i.e.
test_model = Model( inputs=x, outputs=y_pred, name='test_only' )
a_pred = test_model.predict( a )