400% higher error with PyTorch compared with identical Keras model (with Adam optimizer) - tensorflow

TLDR:
A simple (single hidden-layer) feed-forward Pytorch model trained to predict the function y = sin(X1) + sin(X2) + ... sin(X10) substantially underperforms an identical model built/trained with Keras. Why is this so and what can be done to mitigate the difference in performance?
In training a regression model, I noticed that PyTorch drastically underperforms an identical model built with Keras.
This phenomenon has been observed and reported previously:
The same model produces worse results on pytorch than on tensorflow
CNN model in pytorch giving 30% less accuracy to Tensoflowflow model:
PyTorch Adam vs Tensorflow Adam
Suboptimal convergence when compared with TensorFlow model
RNN and Adam: slower convergence than Keras
PyTorch comparable but worse than keras on a simple feed forward network
Why is the PyTorch model doing worse than the same model in Keras even with the same weight initialization?
Why Keras behave better than Pytorch under the same network configuration?
The following explanations and suggestions have been made previously as well:
Using the same decimal precision (32 vs 64): 1, 2,
Using a CPU instead of a GPU: 1,2
Change retain_graph=True to create_graph=True in computing the 2nd derivative with autograd.grad: 1
Check if keras is using a regularizer, constraint, bias, or loss function in a different way from pytorch: 1,2
Ensure you are computing the validation loss in the same way: 1
Use the same initialization routine: 1,2
Training the pytorch model for longer epochs: 1
Trying several random seeds: 1
Ensure that model.eval() is called in validation step when training pytorch model: 1
The main issue is with the Adam optimizer, not the initialization: 1
To understand this issue, I trained a simple two-layer neural network (much simpler than my original model) in Keras and PyTorch, using the same hyperparameters and initialization routines, and following all the recommendations listed above. However, the PyTorch model results in a mean squared error (MSE) that is 400% higher than the MSE of the Keras model.
Here is my code:
0. Imports
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
1. Generate a reproducible dataset
def get_data():
np.random.seed(0)
Xtrain = np.random.normal(0, 1, size=(7000,10))
Xval = np.random.normal(0, 1, size=(700,10))
ytrain = np.sum(np.sin(Xtrain), axis=-1)
yval = np.sum(np.sin(Xval), axis=-1)
scaler = MinMaxScaler()
ytrain = scaler.fit_transform(ytrain.reshape(-1,1)).reshape(-1)
yval = scaler.transform(yval.reshape(-1,1)).reshape(-1)
return Xtrain, Xval, ytrain, yval
class XYData(Dataset):
def __init__(self, X, y):
super(XYData, self).__init__()
self.X = torch.tensor(X, dtype=torch.float32)
self.y = torch.tensor(y, dtype=torch.float32)
self.len = len(y)
def __getitem__(self, index):
return (self.X[index], self.y[index])
def __len__(self):
return self.len
# Data, dataset, and dataloader
Xtrain, Xval, ytrain, yval = get_data()
traindata = XYData(Xtrain, ytrain)
valdata = XYData(Xval, yval)
trainloader = DataLoader(dataset=traindata, shuffle=True, batch_size=32, drop_last=False)
valloader = DataLoader(dataset=valdata, shuffle=True, batch_size=32, drop_last=False)
2. Build Keras and PyTorch models with identical hyperparameters and initialization methods
class TorchLinearModel(nn.Module):
def __init__(self, input_dim=10, random_seed=0):
super(TorchLinearModel, self).__init__()
_ = torch.manual_seed(random_seed)
self.hidden_layer = nn.Linear(input_dim,100)
self.initialize_layer(self.hidden_layer)
self.output_layer = nn.Linear(100, 1)
self.initialize_layer(self.output_layer)
def initialize_layer(self, layer):
_ = torch.nn.init.xavier_normal_(layer.weight)
#_ = torch.nn.init.xavier_uniform_(layer.weight)
_ = torch.nn.init.constant(layer.bias,0)
def forward(self, x):
x = self.hidden_layer(x)
x = self.output_layer(x)
return x
def mean_squared_error(ytrue, ypred):
return torch.mean(((ytrue - ypred) ** 2))
def build_torch_model():
torch_model = TorchLinearModel()
optimizer = optim.Adam(torch_model.parameters(),
betas=(0.9,0.9999),
eps=1e-7,
lr=1e-3,
weight_decay=0)
return torch_model, optimizer
def build_keras_model():
x = layers.Input(shape=10)
z = layers.Dense(units=100, activation=None, use_bias=True, kernel_regularizer=None,
bias_regularizer=None)(x)
y = layers.Dense(units=1, activation=None, use_bias=True, kernel_regularizer=None,
bias_regularizer=None)(z)
keras_model = Model(x, y, name='linear')
optimizer = Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.9999, epsilon=1e-7,
amsgrad=False)
keras_model.compile(optimizer=optimizer, loss='mean_squared_error')
return keras_model
# Instantiate models
torch_model, optimizer = build_torch_model()
keras_model = build_keras_model()
3. Train PyTorch model for 100 epochs:
torch_trainlosses, torch_vallosses = [], []
for epoch in range(100):
# Training
losses = []
_ = torch_model.train()
for i, (x,y) in enumerate(trainloader):
optimizer.zero_grad()
ypred = torch_model(x)
loss = mean_squared_error(y, ypred)
_ = loss.backward()
_ = optimizer.step()
losses.append(loss.item())
torch_trainlosses.append(np.mean(losses))
# Validation
losses = []
_ = torch_model.eval()
with torch.no_grad():
for i, (x, y) in enumerate(valloader):
ypred = torch_model(x)
loss = mean_squared_error(y, ypred)
losses.append(loss.item())
torch_vallosses.append(np.mean(losses))
print(f"epoch={epoch+1}, train_loss={torch_trainlosses[-1]:.4f}, val_loss={torch_vallosses[-1]:.4f}")
4. Train Keras model for 100 epochs:
history = keras_model.fit(Xtrain, ytrain, sample_weight=None, batch_size=32, epochs=100,
validation_data=(Xval, yval))
5. Loss in training history
plt.plot(torch_trainlosses, color='blue', label='PyTorch Train')
plt.plot(torch_vallosses, color='blue', linestyle='--', label='PyTorch Val')
plt.plot(history.history['loss'], color='brown', label='Keras Train')
plt.plot(history.history['val_loss'], color='brown', linestyle='--', label='Keras Val')
plt.legend()
Keras records a much lower error in the training. Since this may be due to a difference in how Keras computes the loss, I calculated the prediction error on the validation set with sklearn.metrics.mean_squared_error
6. Validation error after training
ypred_keras = keras_model.predict(Xval).reshape(-1)
ypred_torch = torch_model(torch.tensor(Xval, dtype=torch.float32))
ypred_torch = ypred_torch.detach().numpy().reshape(-1)
mse_keras = metrics.mean_squared_error(yval, ypred_keras)
mse_torch = metrics.mean_squared_error(yval, ypred_torch)
print('Percent error difference:', (mse_torch / mse_keras - 1) * 100)
r_keras = pearsonr(yval, ypred_keras)[0]
r_pytorch = pearsonr(yval, ypred_torch)[0]
print("r_keras:", r_keras)
print("r_pytorch:", r_pytorch)
plt.scatter(ypred_keras, yval); plt.title('Keras'); plt.show(); plt.close()
plt.scatter(ypred_torch, yval); plt.title('Pytorch'); plt.show(); plt.close()
Percent error difference: 479.1312469426776
r_keras: 0.9115184443702814
r_pytorch: 0.21728812737220082
The correlation of predicted values with ground truth is 0.912 for Keras but 0.217 for Pytorch, and the error for Pytorch is 479% higher!
7. Other trials
I also tried:
Lowering the learning rate for Pytorch (lr=1e-4), R increases from 0.217 to 0.576, but it's still much worse than Keras (r=0.912).
Increasing the learning rate for Pytorch (lr=1e-2), R is worse at 0.095
Training numerous times with different random seeds. The performance is roughly the same, regardless.
Trained for longer than 100 epochs. No improvement was observed!
Used torch.nn.init.xavier_uniform_ instead of torch.nn.init.xavier_normal_ in the initialization of the weights. R improves from 0.217 to 0.639, but it's still worse than Keras (0.912).
What can be done to ensure that the PyTorch model converges to a reasonable error comparable with the Keras model?

The problem here is unintentional broadcasting in the PyTorch training loop.
The result of a nn.Linear operation always has shape [B,D], where B is the batch size and D is the output dimension. Therefore, in your mean_squared_error function ypred has shape [32,1] and ytrue has shape [32]. By the broadcasting rules used by NumPy and PyTorch this means that ytrue - ypred has shape [32,32]. What you almost certainly meant is for ypred to have shape [32]. This can be accomplished in many ways; probably the most readable is to use Tensor.flatten
class TorchLinearModel(nn.Module):
...
def forward(self, x):
x = self.hidden_layer(x)
x = self.output_layer(x)
return x.flatten()
which produces the following train/val curves

Related

How to train Tensorflow's pre trained BERT on MLM task? ( Use pre-trained model only in Tensorflow)

Before Anyone suggests pytorch and other things, I am looking specifically for Tensorflow + pretrained + MLM task only. I know, there are lots of blogs for PyTorch and lots of blogs for fine tuning ( Classification) on Tensorflow.
Coming to the problem, I got a language model which is English + LaTex where a text data can represent any text from Physics, Chemistry, MAths and Biology and any typical example can look something like this:
Link to OCR image
"Find the value of function x in the equation: \n \\( f(x)=\\left\\{\\begin{array}{ll}x^{2} & \\text { if } x<0 \\\\ 2 x & \\text { if } x \\geq 0\\end{array}\\right. \\)"
So my language model needs to understand \geq \\begin array \eng \left \right other than the English language and that is why I need to train an MLM first on pre-trained BERT or SciBERT to have both. So I went up digging the internet and found some tutorials:
MLM training on Tensorflow BUT from Scratch; I need pre-trained
MLM on pre-trained but in Pytorch; I need Tensorflow
Fine Tuning with Keras; It is for classification but I want MLM
I already have a fine tuning classification model. Some of the code is as follows:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-large-uncased')
def regular_encode(texts, tokenizer, maxlen=maxlen):
enc_di = tokenizer.batch_encode_plus(texts, return_token_type_ids=False,padding='max_length',max_length=maxlen,truncation = True,)
return np.array(enc_di['input_ids'])
Xtrain_encoded = regular_encode(X_train.astype('str'), tokenizer, maxlen=maxlen)
ytrain_encoded = tf.keras.utils.to_categorical(y_train, num_classes=classes,dtype = 'int32')
def build_model(transformer, loss='categorical_crossentropy', max_len=maxlen, dense = 512, drop1 = 0.3, drop2 = 0.3):
input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
sequence_output = transformer(input_word_ids)[0]
cls_token = sequence_output[:, 0, :]
#Fine Tuning Model Start
x = tf.keras.layers.Dropout(drop1)(cls_token)
x = tf.keras.layers.Dense(512, activation='relu')(x)
x = tf.keras.layers.Dropout(drop2)(x)
out = tf.keras.layers.Dense(classes, activation='softmax')(x)
model = tf.keras.Model(inputs=input_word_ids, outputs=out)
return model
Only useful thing I could get was in the HuggingFace that
With the tight interoperability between TensorFlow and PyTorch models, you can even save the model and then reload it as a PyTorch model (or vice-versa)
from transformers import AutoModelForSequenceClassification
model.save_pretrained("my_imdb_model")
pytorch_model = AutoModelForSequenceClassification.from_pretrained("my_imdb_model", from_tf=True)
So maybe I could train a pytorch MLM and then load it as a tensorflow fine tuned classification model?
Is there any other way?
I think after some research, I found something. I don't know if it'll work but it uses transformer and Tensorflow with XLM . I think it'll work for BERT too.
PRETRAINED_MODEL = 'jplu/tf-xlm-roberta-large'
from tensorflow.keras.optimizers import Adam
import transformers
from transformers import TFAutoModelWithLMHead, AutoTokenizer
def create_mlm_model_and_optimizer():
with strategy.scope():
model = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
return model, optimizer
mlm_model, optimizer = create_mlm_model_and_optimizer()
def define_mlm_loss_and_metrics():
with strategy.scope():
mlm_loss_object = masked_sparse_categorical_crossentropy
def compute_mlm_loss(labels, predictions):
per_example_loss = mlm_loss_object(labels, predictions)
loss = tf.nn.compute_average_loss(
per_example_loss, global_batch_size = global_batch_size)
return loss
train_mlm_loss_metric = tf.keras.metrics.Mean()
return compute_mlm_loss, train_mlm_loss_metric
def masked_sparse_categorical_crossentropy(y_true, y_pred):
y_true_masked = tf.boolean_mask(y_true, tf.not_equal(y_true, -1))
y_pred_masked = tf.boolean_mask(y_pred, tf.not_equal(y_true, -1))
loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_masked,
y_pred_masked,
from_logits=True)
return loss
def train_mlm(train_dist_dataset, total_steps=2000, evaluate_every=200):
step = 0
### Training lopp ###
for tensor in train_dist_dataset:
distributed_mlm_train_step(tensor)
step+=1
if (step % evaluate_every == 0):
### Print train metrics ###
train_metric = train_mlm_loss_metric.result().numpy()
print("Step %d, train loss: %.2f" % (step, train_metric))
### Reset metrics ###
train_mlm_loss_metric.reset_states()
if step == total_steps:
break
#tf.function
def distributed_mlm_train_step(data):
strategy.experimental_run_v2(mlm_train_step, args=(data,))
#tf.function
def mlm_train_step(inputs):
features, labels = inputs
with tf.GradientTape() as tape:
predictions = mlm_model(features, training=True)[0]
loss = compute_mlm_loss(labels, predictions)
gradients = tape.gradient(loss, mlm_model.trainable_variables)
optimizer.apply_gradients(zip(gradients, mlm_model.trainable_variables))
train_mlm_loss_metric.update_state(loss)
compute_mlm_loss, train_mlm_loss_metric = define_mlm_loss_and_metrics()
Now train it as train_mlm(train_dist_dataset, TOTAL_STEPS, EVALUATE_EVERY)
Above ode is from this notebook and you need to do all the necessary things given exactly
The author says in the end that:
This fine tuned model can be loaded just as the original to build a classification model from it

Completely different results using Tensorflow and Pytorch for MobilenetV3 Small

I am using transfer learning from MobileNetV3 Small to predict 5 different points on an image. I am doing this as a regression task.
For both models:
Setting the last 50 layers trainable and adding the same fully connected layers to the end.
Learning rate 3e-2
Batch size 32
Adam optimizer with the same betas
100 epochs
The inputs consist of RGB unscaled images
Pytorch
Model
def _init_weights(m):
if type(m) == nn.Linear:
nn.init.xavier_uniform_(m.weight)
m.bias.data.fill_(0.01)
def get_mob_v3_small():
model = torchvision.models.mobilenet_v3_small(pretrained=True)
children_list = get_children(model)
for c in children_list[:-50]:
for p in c.parameters():
p.requires_grad = False
return model
class TransferMobileNetV3_v2(nn.Module):
def __init__(self,
num_keypoints: int = 5):
super(TransferMobileNetV3_v2, self).__init__()
self.classifier_neurons = num_keypoints*2
self.base_model = get_mob_v3_small()
self.base_model.classifier = nn.Sequential(
nn.Linear(in_features=1024, out_features=1024),
nn.ReLU(),
nn.Linear(in_features=1024, out_features=512),
nn.ReLU(),
nn.Linear(in_features=512, out_features=self.classifier_neurons)
)
self.base_model.apply(_init_weights)
def forward(self, x):
out = self.base_model(x)
return out
Training Script
def train(net, trainloader, testloader, train_loss_fn, optimizer, scaler, args):
len_dataloader = len(trainloader)
for epoch in range(1, args.epochs+1):
net.train()
for batch_idx, sample in enumerate(trainloader):
inputs, labels = sample
inputs, labels = inputs.to(args.device), labels.to(args.device)
optimizer.zero_grad()
with torch.cuda.amp.autocast(args.use_amp):
prediction = net(inputs)
loss = train_loss_fn(prediction, labels)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
def main():
args = make_args_parser()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = args.seed
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=3e-2,
betas=(0.9, 0.999))
scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
train(net, train_loader, test_loader, loss_fn, optimizer, scaler, args)
Tensorflow
Model
base_model = tf.keras.applications.MobileNetV3Small(weights='imagenet',
input_shape=(224,224,3))
x_in = base_model.layers[-6].output
x = Dense(units=1024, activation="relu")(x_in)
x = Dense(units=512, activation="relu")(x)
x = Dense(units=10, activation="linear")(x)
model = Model(inputs=base_model.input, outputs=x)
for layer in model.layers[:-50]:
layer.trainable=False
Training Script
model.compile(loss = "mse",
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2))
history = model.fit(input_numpy, output_numpy,
verbose=1,
batch_size=32, epochs=100,validation_split = 0.2)
Results
The PyTorch model predicts one single point around the center for all 5 different points.
The Tensorflow model predicts the points quite well and are quite accurate.
The loss in the Pytorch model is much higher than the Tensorflow model.
Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results. Please do let me know what is going wrong as I am trying my best to shift to PyTorch for this work and I need this model to give me similar/identical results.
Note: I also noticed that the MobileNetV3 Small model seems to be different in PyTorch and different in Tensorflow. I do not know if am interpreting it wrong, but I'm putting it here just in case.

How to apply Triplet Loss for a ResNet50 based Siamese Network in Keras or Tf 2

I have a ResNet based siamese network which uses the idea that you try to minimize the l-2 distance between 2 images and then apply a sigmoid so that it gives you {0:'same',1:'different'} output and based on how far the prediction is, you just flow the gradients back to network but there is a problem that updation of gradients is too little as we're changing the distance between {0,1} so I thought of using the same architecture but based on Triplet Loss.
I1 = Input(shape=image_shape)
I2 = Input(shape=image_shape)
res_m_1 = ResNet50(include_top=False, weights='imagenet', input_tensor=I1, pooling='avg')
res_m_2 = ResNet50(include_top=False, weights='imagenet', input_tensor=I2, pooling='avg')
x1 = res_m_1.output
x2 = res_m_2.output
# x = Flatten()(x) or use this one if not using any pooling layer
distance = Lambda( lambda tensors : K.abs( tensors[0] - tensors[1] )) ([x1,x2] )
final_output = Dense(1,activation='sigmoid')(distance)
siamese_model = Model(inputs=[I1,I2], outputs=final_output)
siamese_model.compile(loss='binary_crossentropy',optimizer=Adam(),metrics['acc'])
siamese_model.fit_generator(train_gen,steps_per_epoch=1000,epochs=10,validation_data=validation_data)
So how can I change it to use the Triplet Loss function? What adjustments should be done here in order to get this done? One change will be that I'll have to calculate
res_m_3 = ResNet50(include_top=False, weights='imagenet', input_tensor=I2, pooling='avg')
x3 = res_m_3.output
One thing found in tf docs is triplet-semi-hard-loss and is given as:
tfa.losses.TripletSemiHardLoss()
As shown in the paper, the best results are from triplets known as "Semi-Hard". These are defined as triplets where the negative is farther from the anchor than the positive, but still produces a positive loss. To efficiently find these triplets we utilize online learning and only train from the Semi-Hard examples in each batch.
Another implementation of Triplet Loss which I found on Kaggle is: Triplet Loss Keras
Which one should I use and most importantly, HOW?
P.S: People also use something like: x = Lambda(lambda x: K.l2_normalize(x,axis=1))(x) after model.output. Why is that? What is this doing?
Following this answer of mine, and with role of TripletSemiHardLoss in mind, we could do following:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
from tensorflow.keras import models, layers
BATCH_SIZE = 32
LATENT_DEM = 128
def _normalize_img(img, label):
img = tf.cast(img, tf.float32) / 255.
return (img, label)
train_dataset, test_dataset = tfds.load(name="mnist", split=['train', 'test'], as_supervised=True)
# Build your input pipelines
train_dataset = train_dataset.shuffle(1024).batch(BATCH_SIZE)
train_dataset = train_dataset.map(_normalize_img)
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(_normalize_img)
inputs = layers.Input(shape=(28, 28, 1))
resNet50 = tf.keras.applications.ResNet50(include_top=False, weights=None, input_tensor=inputs, pooling='avg')
outputs = layers.Dense(LATENT_DEM, activation=None)(resNet50.output) # No activation on final dense layer
outputs = layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(outputs) # L2 normalize embedding
siamese_model = models.Model(inputs=inputs, outputs=outputs)
# Compile the model
siamese_model.compile(
optimizer=tf.keras.optimizers.Adam(0.001),
loss=tfa.losses.TripletSemiHardLoss())
# Train the network
history = siamese_model.fit(
train_dataset,
epochs=3)

Loss function with derivative in TensorFlow 2

I am using TF2 (2.3.0) NN to approximate the function y which solves the ODE: y'+3y=0
I have defined cutsom loss class and function in which I am trying to differentiate the single output with respect to the single input so the equation holds, provided that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf
class CustomLossOde(Loss):
def __init__(self, x, model, name='ode_loss'):
super().__init__(name=name)
self.x = x
self.model = model
def call(self, y_true, y_pred):
with tf.GradientTape() as tape:
tape.watch(self.x)
y_p = self.model(self.x)
dy_dx = tape.gradient(y_p, self.x)
loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99),loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
for now I am getting 0 loss from the fisrt epoch, which doesn't make any sense.
I have printed both y_true and y_test from within the function and they seem OK so I suspect that the problem is in the gradien which I didn't succeed to print.
Apprecitate any help
Defining a custom loss with the high level Keras API is a bit difficult in that case. I would instead write the training loop from scracth, as it allows a finer grained control over what you can do.
I took inspiration from those two guides :
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tape can interact seamlessly. I use one to compute the loss function, the other to calculate the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
# using the high level tf.data API for data handling
x_train = tf.reshape(x_train,(-1,1))
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1)
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x,y_true) in enumerate(dataset):
# we need to convert x to a variable if we want the tape to be
# able to compute the gradient according to x
x_variable = tf.Variable(x)
with tf.GradientTape() as model_tape:
with tf.GradientTape() as loss_tape:
loss_tape.watch(x_variable)
y_pred = model(x_variable)
dy_dx = loss_tape.gradient(y_pred, x_variable)
loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
grad = model_tape.gradient(loss, model.trainable_variables)
opt.apply_gradients(zip(grad, model.trainable_variables))
if step%20==0:
print(f"Step {step}: loss={loss.numpy()}")

Tensorflow: why is there a diffrence in training a model and params directly

Below is a simple program I wrote. Its job is to learn the parameters of a simple linear function Ax+B.
When training "manually" it converges after a few thousand epochs, but when I try to do the same using a Dense(1) layer+model it converges to a loss of 500000, and the trained parameters are nowhere near the correct ones (-2, 34).
I thought Dense(1) layer is just like Ax+B, but it's not?
from tensorflow.keras import layers, models
from tensorflow.keras import optimizers
from tensorflow.keras import initializers
import tensorflow as tf
import random
import numpy as np
x_train = np.linspace(1, 100, 100)
y_train = -2*x_train+34
def manual_train():
optimizer = optimizers.Adam(lr=1e-2)
vars = [tf.Variable(random.random(), trainable=True) for i in range(2)]
for epoch in range(1000000):
with tf.GradientTape() as tape:
y_pred = vars[0]*x_train + vars[1]
loss = tf.reduce_sum(tf.abs(y_train - y_pred))
model_gradients = tape.gradient(loss, vars)
optimizer.apply_gradients(zip(model_gradients, vars))
print(epoch, 'parameters', vars[0].numpy(), vars[1].numpy(), 'loss', loss.numpy())
def nn_train():
input_layer = layers.Input(shape=(1,))
output_layer = layers.Dense(1, kernel_initializer=initializers.RandomUniform(0,1), bias_initializer=initializers.RandomUniform(0,1))(input_layer)
model = models.Model(inputs=input_layer, outputs=output_layer)
optimizer = optimizers.Adam(lr=1e-2)
#model.compile(optimizer=optimizer, loss=None)
for epoch in range(1000000):
with tf.GradientTape() as tape:
y_pred = model(x_train.reshape((-1,1)))
loss = tf.reduce_sum(tf.abs(y_train - y_pred))
model_gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(model_gradients, model.trainable_variables))
print(epoch, 'parameters', model.trainable_variables[0].numpy(), model.trainable_variables[1].numpy(), 'loss', loss.numpy())
# uncomment one:
manual_train()
#nn_train()
You have a problem of shape in your keras example.
Because of that problem, the operation y_train - y_pred does not exactly what you think it does.
Because at that point, y_train has a shape of (100) and y_pred a shape of (100,1), when you do the subtraction, TensorFlow will do a "smart" broadcast, and the end result is an array of size (100,100)
Reshape your ground truth to the same shape as your output (or the other way around), to get the correct value of your loss:
loss = tf.reduce_sum(tf.abs(y_train.reshape((-1,1)) - y_pred))