Tensorflow 2 Metrics produce wrong results with 2 GPUs - tensorflow

I took this piece of code from tensorflow documentation about distributed training with custom loop https://www.tensorflow.org/tutorials/distribute/custom_training and I just fixed it to work with the tf.keras.metrics.AUC and run it with 2 GPUS (2 Nvidia V100 from a DGX machine).
# Import TensorFlow
import tensorflow as tf
# Helper libraries
import numpy as np
print(tf.__version__)
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# Adding a dimension to the array -> new shape == (28, 28, 1)
# We are doing this because the first layer in our model is a convolutional
# layer and it requires a 4D input (batch_size, height, width, channels).
# batch_size dimension will be added later on.
train_images = train_images[..., None]
test_images = test_images[..., None]
# One hot
train_labels = tf.keras.utils.to_categorical(train_labels, 10)
test_labels = tf.keras.utils.to_categorical(test_labels, 10)
# Getting the images in [0, 1] range.
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)
# If the list of devices is not specified in the
# `tf.distribute.MirroredStrategy` constructor, it will be auto-detected.
GPUS = [0, 1]
devices = ["/gpu:" + str(gpu_id) for gpu_id in GPUS]
strategy = tf.distribute.MirroredStrategy(devices=devices)
print ('Number of devices: {}'.format(strategy.num_replicas_in_sync))
BUFFER_SIZE = len(train_images)
BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
EPOCHS = 10
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(GLOBAL_BATCH_SIZE)
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)
def create_model():
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(32, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(64, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(10, activation='softmax')
])
return model
with strategy.scope():
# Set reduction to `none` so we can do the reduction afterwards and divide by
# global batch size.
loss_object = tf.keras.losses.CategoricalCrossentropy(
from_logits=True,
reduction=tf.keras.losses.Reduction.NONE)
def compute_loss(labels, predictions):
per_example_loss = loss_object(labels, predictions)
return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)
with strategy.scope():
test_loss = tf.keras.metrics.Mean(name='test_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(
name='train_accuracy')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(
name='test_accuracy')
train_auc = tf.keras.metrics.AUC(name='train_auc')
test_auc = tf.keras.metrics.AUC(name='test_auc')
# model, optimizer, and checkpoint must be created under `strategy.scope`.
with strategy.scope():
model = create_model()
optimizer = tf.keras.optimizers.Adam()
def train_step(inputs):
images, labels = inputs
with tf.GradientTape() as tape:
predictions = model(images, training=True)
loss = compute_loss(labels, predictions)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
train_accuracy(labels, predictions)
train_auc(labels, predictions)
return loss
def test_step(inputs):
images, labels = inputs
predictions = model(images, training=False)
t_loss = loss_object(labels, predictions)
test_loss.update_state(t_loss)
test_accuracy(labels, predictions)
test_auc(labels, predictions)
# `run` replicates the provided computation and runs it
# with the distributed input.
#tf.function
def distributed_train_step(dataset_inputs):
per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
axis=None)
#tf.function
def distributed_test_step(dataset_inputs):
return strategy.run(test_step, args=(dataset_inputs,))
for epoch in range(EPOCHS):
# TRAIN LOOP
total_loss = 0.0
num_batches = 0
for x in train_dist_dataset:
total_loss += distributed_train_step(x)
num_batches += 1
train_loss = total_loss / num_batches
# TEST LOOP
for x in test_dist_dataset:
distributed_test_step(x)
template = ("Epoch {}, Loss: {}, Accuracy: {}, AUC: {},"
"Test Loss: {}, Test Accuracy: {}, Test AUC: {}")
print (template.format(epoch+1,
train_loss, train_accuracy.result()*100, train_auc.result()*100,
test_loss.result(), test_accuracy.result()*100, test_auc.result()*100))
test_loss.reset_states()
train_accuracy.reset_states()
test_accuracy.reset_states()
train_auc.reset_states()
test_auc.reset_states()
The problem is that AUC's evaluation is definitely wrong cause it exceeds its range (should be from 0-100) and i get theese results by running the above code for one time:
Epoch 1, Loss: 1.8061423301696777, Accuracy: 66.00833892822266, AUC: 321.8688659667969,Test Loss: 1.742477536201477, Test Accuracy: 72.0999984741211, Test AUC: 331.33709716796875
Epoch 2, Loss: 1.7129968404769897, Accuracy: 74.9816665649414, AUC: 337.37017822265625,Test Loss: 1.7084736824035645, Test Accuracy: 75.52999877929688, Test AUC: 337.1878967285156
Epoch 3, Loss: 1.643971562385559, Accuracy: 81.83333587646484, AUC: 355.96209716796875,Test Loss: 1.6072628498077393, Test Accuracy: 85.3499984741211, Test AUC: 370.603759765625
Epoch 4, Loss: 1.5887378454208374, Accuracy: 87.27833557128906, AUC: 373.6204528808594,Test Loss: 1.5906082391738892, Test Accuracy: 87.13999938964844, Test AUC: 371.9998474121094
Epoch 5, Loss: 1.581775426864624, Accuracy: 88.0, AUC: 373.9468994140625,Test Loss: 1.5964380502700806, Test Accuracy: 86.68000030517578, Test AUC: 371.0227355957031
Epoch 6, Loss: 1.5764907598495483, Accuracy: 88.49166870117188, AUC: 375.2404479980469,Test Loss: 1.5832056999206543, Test Accuracy: 87.94000244140625, Test AUC: 373.41998291015625
Epoch 7, Loss: 1.5698528289794922, Accuracy: 89.19166564941406, AUC: 376.473876953125,Test Loss: 1.5770654678344727, Test Accuracy: 88.58000183105469, Test AUC: 375.5516662597656
Epoch 8, Loss: 1.564456820487976, Accuracy: 89.71833801269531, AUC: 377.8564758300781,Test Loss: 1.5792100429534912, Test Accuracy: 88.27000427246094, Test AUC: 373.1791687011719
Epoch 9, Loss: 1.5612279176712036, Accuracy: 90.02000427246094, AUC: 377.9949645996094,Test Loss: 1.5729509592056274, Test Accuracy: 88.9800033569336, Test AUC: 375.5257263183594
Epoch 10, Loss: 1.5562015771865845, Accuracy: 90.54000091552734, AUC: 378.9789123535156,Test Loss: 1.56815767288208, Test Accuracy: 89.3499984741211, Test AUC: 375.8636474609375
Accuracy is ok but it seems that it's the only one metric that behaves nice. I tried other metrics too but they are not evaluated correctly. It seems that the problems come when using more than one GPU, cause when I run this code with one GPU it produce the right results.

When you use distributed strategy, the metric must be constructed and used inside the strategy.scope() block. So when you want to call the metric.result() method, remember to put it inside the with strategy.scope() block.
with strategy.scope():
print(metric.result())

Related

Keras multi target classification model test accuracy is less than train accuracy

I am building multi classification model, after fit method, getting test accuracy is very very less than train accuracy even drop outs layers attached.
The layers definition is as follows.
n_labels = len(unique(X_train_enc))
in_layer = Input(shape=(1,))
em_layer = Embedding(input_dim = int(n_labels)+1,output_dim = 7,
input_length = 1, name="embedding")(in_layer)
dense = Dense(512, activation='relu', kernel_initializer='he_normal')(em_layer)
dense = Dropout(0.2)(dense)
output = Dense(7, activation='softmax')(dense)
model = Model(inputs=in_layer, outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=100, verbose=1)
Epoch 98/100
572/572 [==============================] - 11s 19ms/step - loss: 3.7565e-11 - accuracy: 1.0000
Epoch 99/100
313/572 [===============>..............] - ETA: 5s - loss: 2.2852e-11 - accuracy: 1.0000
test_loss, test_acc = model.evaluate(X_test_enc, y_test_enc,verbose=1)
print('Test Accuracy: ', test_acc, '\nTest Loss: ', test_loss)
print('Accuracy: %.2f' % (test_acc*100))
766/766 [==============================] - 7s 9ms/step - loss: 20.4420 - accuracy: 0.1426
Test Accuracy: 0.1426003873348236
Test Loss: 20.441951751708984
Accuracy: 14.26
any tuning will be highly appreciated.

The interaction between two networks in a Tensorflow custom loss function

Assume you have two Tensorflow models model_A and model_B and the training loop looks something like this,
with tf.GradientTape() as tape:
output_A = model_A(input)
output_B = model_B(input)
loss = loss_fn(output_A, output_B, true_output_A, true_output_B)
grads = tape.gradient(loss, model_A.trainable_variables)
optimizer.apply_gradients(zip(grads, model_A.trainable_variables))
and define the loss function as,
def loss_fn(output_A, output_B, true_output_A, true_output_B)
loss = (output_A + output_B) - (true_output_A + true_output_B)
return loss
The loss function that is being used to update model_A has the output of another network (output_B). How does Tensorflow handle this situation?
Does it use the weights of model_B when computing the gradient? or does it deal with output_B as a constant and not try to trace its origins?
It won't use model_B weights, only model_A weights will be updated.
For example:
import tensorflow as tf
# Model1
cnnin = tf.keras.layers.Input(shape=(10, 10, 1))
x = tf.keras.layers.Conv2D(8, 4)(cnnin)
x = tf.keras.layers.Conv2D(16, 4)(x)
x = tf.keras.layers.Conv2D(32, 2)(x)
x = tf.keras.layers.Conv2D(64, 2)(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(4)(x)
x = tf.keras.layers.Dense(4, activation="relu")(x)
cnnout = tf.keras.layers.Dense(1, activation="linear")(x)
# Model 2
mlpin= tf.keras.layers.Input(shape=(10, 10, 1), name="mlp_input")
z= tf.keras.layers.Dense(4, activation="sigmoid")(mlpin)
z= tf.keras.layers.Dense(4, activation = "softmax")(z)
z = tf.keras.layers.Flatten()(z)
z = tf.keras.layers.Dense(4)(z)
mlpout = tf.keras.layers.Dense(1, activation="linear")(z)
Loss function
def loss_fn(output_A, output_B, true_output_A, true_output_B):
output_A = tf.reshape(output_A, [-1])
output_B = tf.reshape(output_B, [-1])
pred = tf.reduce_sum(output_A + output_B)
inputs = tf.reduce_sum(true_output_A+ true_output_B)
loss = inputs-pred
return loss
Customize what happens in Model.fit
loss_tracker = tf.keras.metrics.Mean(name = "custom_loss")
class TestModel(tf.keras.Model):
def __init__(self, model1, model2):
super(TestModel, self).__init__()
self.model1 = model1
self.model2 = model2
def compile(self, optimizer):
super(TestModel, self).compile()
self.optimizer = optimizer
def train_step(self, data):
x, (y1, y2) = data
with tf.GradientTape() as tape:
ypred1 = self.model1([x], training = True)
ypred2 = self.model2([x], training = True)
loss_value = loss_fn(ypred1, ypred2, y1,y2)
# Compute gradients
trainable_vars = self.model1.trainable_variables
gradients = tape.gradient(loss_value, trainable_vars)
# Update weights
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
loss_tracker.update_state(loss_value)
return {"loss": loss_tracker.result()}
Define model1 and model2 and save them, so you can check the weights after training
model1= tf.keras.models.Model(cnnin, cnnout, name="model1")
model2 = tf.keras.models.Model(mlpin, mlpout, name="model2")
model1.save('test_model1.h5')
model2.save('test_model2.h5')
import numpy as np
x = np.random.rand(6, 10,10,1)
y1 = np.random.rand(6,1)
y2 = np.random.rand(6,1)
trainable_model = TestModel(model1, model2)
trainable_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0001))
trainable_model.fit(x=x, y = (y1, y2), epochs=10)
Gives the following output:
Epoch 1/10
1/1 [==============================] - 0s 375ms/step - loss: 7.9465
Epoch 2/10
1/1 [==============================] - 0s 6ms/step - loss: 7.8509
Epoch 3/10
1/1 [==============================] - 0s 6ms/step - loss: 7.7547
Epoch 4/10
1/1 [==============================] - 0s 6ms/step - loss: 7.6577
Epoch 5/10
1/1 [==============================] - 0s 5ms/step - loss: 7.5600
Epoch 6/10
1/1 [==============================] - 0s 4ms/step - loss: 7.4608
Epoch 7/10
1/1 [==============================] - 0s 4ms/step - loss: 7.3574
Epoch 8/10
1/1 [==============================] - 0s 6ms/step - loss: 7.2514
Epoch 9/10
1/1 [==============================] - 0s 5ms/step - loss: 7.1429
Epoch 10/10
1/1 [==============================] - 0s 5ms/step - loss: 7.0323
Then load saved models and check the trainable_weights:
test_model1 = tf.keras.models.load_model('test_model1.h5')
test_model2 = tf.keras.models.load_model('test_model2.h5')
Compare model1 trainable_weights before and after training (they should all change):
model1_weights = [i for i in model1.trainable_weights]
for i in range(len(model1_weights)):
print(np.array_equal(model1.trainable_weights[i], test_model1.trainable_weights[i]))
Outputs:
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Compare model2 trainable_weights before and after training (they should all be the same):
model2_weights = [i for i in model2.trainable_weights]
for i in range(len(model2_weights)):
print(np.array_equal(model2.trainable_weights[i], test_model2.trainable_weights[i]))
Outputs:
True
True
True
True
True
True
True
True

Classification with PyTorch is much slower than Tensorflow: 42min vs. 11min

I have been a Tensorflow user and start to use Pytorch. As a trial, I implemented simple classification tasks with both libraries.
However, PyTorch is much slower than Tensorflow: Pytorch takes 42min while TensorFlow 11min. I referred to PyTorch official Tutorial, and made only little change from it.
Could anyone share some advice for this problem?
Here is the summary what I tried.
environment: Colab Pro+
dataset: Cifar10
classifier: VGG16
optimizer: Adam
loss: crossentropy
batch size: 32
PyTorch
Code:
import torch, torchvision
from torch import nn
from torchvision import transforms, models
from tqdm import tqdm
import time, copy
trans = transforms.Compose([transforms.Resize((224, 224)),
transforms.ToTensor(),])
data = {phase: torchvision.datasets.CIFAR10('./', train = (phase=='train'), transform=trans, download=True) for phase in ['train', 'test']}
dataloaders = {phase: torch.utils.data.DataLoader(data[phase], batch_size=32, shuffle=True) for phase in ['train', 'test']}
def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'test']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in tqdm(iter(dataloaders[phase])):
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / len(dataloaders[phase])
epoch_acc = running_corrects.double() / len(dataloaders[phase])
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'test' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = models.vgg16(pretrained=False)
model = model.to(device)
model = train_model(model=model,
criterion=nn.CrossEntropyLoss(),
optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
dataloaders=dataloaders,
device=device,
)
Result:
Epoch 0/4
----------
0%| | 0/1563 [00:00<?, ?it/s]/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:718: UserWarning: Named tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)
return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
100%|██████████| 1563/1563 [07:50<00:00, 3.32it/s]
train Loss: 75.5199 Acc: 3.2809
100%|██████████| 313/313 [00:38<00:00, 8.11it/s]
test Loss: 73.7274 Acc: 3.1949
Epoch 1/4
----------
100%|██████████| 1563/1563 [07:50<00:00, 3.33it/s]
train Loss: 73.8162 Acc: 3.2514
100%|██████████| 313/313 [00:38<00:00, 8.13it/s]
test Loss: 73.6114 Acc: 3.1949
Epoch 2/4
----------
100%|██████████| 1563/1563 [07:49<00:00, 3.33it/s]
train Loss: 73.7741 Acc: 3.1369
100%|██████████| 313/313 [00:38<00:00, 8.11it/s]
test Loss: 73.5873 Acc: 3.1949
Epoch 3/4
----------
100%|██████████| 1563/1563 [07:49<00:00, 3.33it/s]
train Loss: 73.7493 Acc: 3.1331
100%|██████████| 313/313 [00:38<00:00, 8.12it/s]
test Loss: 73.6191 Acc: 3.1949
Epoch 4/4
----------
100%|██████████| 1563/1563 [07:49<00:00, 3.33it/s]
train Loss: 73.7289 Acc: 3.1939
100%|██████████| 313/313 [00:38<00:00, 8.13it/s]test Loss: 73.5955 Acc: 3.1949
Training complete in 42m 22s
Best val Acc: 3.194888
Tensorflow
Code:
import tensorflow_datasets as tfds
from tensorflow.keras import applications, models
import tensorflow as tf
import time
ds_test, ds_train = tfds.load('cifar10', split=['test', 'train'])
def resize(ip):
image = ip['image']
label = ip['label']
image = tf.image.resize(image, (224, 224))
image = tf.expand_dims(image,0)
label = tf.one_hot(label,10)
label = tf.expand_dims(label,0)
return (image, label)
ds_train_ = ds_train.map(resize)
ds_test_ = ds_test.map(resize)
model = applications.vgg16.VGG16(input_shape = (224, 224, 3), weights=None, classes=10)
model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics= ['accuracy'])
batch_size = 32
since = time.time()
history = model.fit(ds_train_,
batch_size = batch_size,
steps_per_epoch = len(ds_train)//batch_size,
epochs = 5,
validation_steps = len(ds_test),
validation_data = ds_test_,
shuffle = True,)
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60 ))
Result:
Epoch 1/5
1562/1562 [==============================] - 125s 69ms/step - loss: 36.9022 - accuracy: 0.1069 - val_loss: 2.3031 - val_accuracy: 0.1000
Epoch 2/5
1562/1562 [==============================] - 129s 83ms/step - loss: 2.3031 - accuracy: 0.1005 - val_loss: 2.3033 - val_accuracy: 0.1000
Epoch 3/5
1562/1562 [==============================] - 129s 83ms/step - loss: 2.3035 - accuracy: 0.1069 - val_loss: 2.3031 - val_accuracy: 0.1000
Epoch 4/5
1562/1562 [==============================] - 129s 83ms/step - loss: 2.3038 - accuracy: 0.1024 - val_loss: 2.3030 - val_accuracy: 0.1000
Epoch 5/5
1562/1562 [==============================] - 129s 83ms/step - loss: 2.3028 - accuracy: 0.1024 - val_loss: 2.3033 - val_accuracy: 0.1000
Training complete in 11m 23s
It is because in your tensorflow codes, the data pipeline is feeding a batch of 1 image into the model per step instead of a batch of 32 images.
Passing batch_size into model.fit does not really control the batch size when the data is in the form of datasets. The reason why it showed a seemingly correct steps per epoch from the log is that you passed steps_per_epoch into model.fit.
To correctly set the batch size:
ds_test, ds_train = tfds.load('cifar10', split=['test', 'train'])
def resize(ip):
image = ip['image']
label = ip['label']
image = tf.image.resize(image, (224, 224))
label = tf.one_hot(label,10)
return (image, label)
train_size=len(ds_train)
test_size=len(ds_test)
ds_train_ = ds_train.shuffle(train_size).batch(32).map(resize)
ds_test_ = ds_test.shuffle(test_size).batch(32).map(resize)
model.fit call:
history = model.fit(ds_train_,
epochs = 1,
validation_data = ds_test_)
After fixed the problem, tensorflow got similar speed performance with pytorch. In my machine, pytorch took ~27 minutes per epoch while tensorflow took ~24 minutes per epoch.
According to the benchmarks from NVIDIA, pytorch and tensorflow had similar speed performance in most popular deep learning applications with real-world datasets and problem size. (Reference: https://developer.nvidia.com/deep-learning-performance-training-inference)

LSTM: loss value is not changing

I am working on predicting stock trend (up, or down).
Below is how I am handling my pre-processing.
index_ = len(df.columns) - 1
x = df.iloc[:,:index_]
x = x[['Relative_Volume', 'CurrentPrice', 'MarketCap']]
x = x.values.astype(float)
# x = x.reshape(len(x), 1, x.shape[1]).astype(float)
x = x.reshape(*x.shape, 1)
y = df.iloc[:,index_:].values.astype(float)
# x.shape = (44930, 3, 1)
# y.shape = (44930, 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=98 )
Then I am building my BILSTM model:
def build_nn():
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True, input_shape = (x_train.shape[0], 1) , name="one")))
model.add(Dropout(0.20))
model.add(Bidirectional(LSTM(128, return_sequences=True , name="two")))
model.add(Dropout(0.20))
model.add(Bidirectional(LSTM(64, return_sequences=False , name="three")))
model.add(Dropout(0.20))
model.add(Dense(1,activation='sigmoid'))
# opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
opt = SGD(lr=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
return model
filepath = "bilstmv1.h5"
chkp = ModelCheckpoint(monitor = 'val_accuracy', mode = 'auto', filepath=filepath, verbose = 1, save_best_only=True)
model = build_nn()
# model.summary()
model.fit(x_train, y_train,
epochs=3,
batch_size=256,
validation_split=0.1, callbacks=[chkp])
model.summary()
Below is the output of the loss_value:
Epoch 1/3
127/127 [==============================] - 27s 130ms/step - loss: 0.6829 - accuracy: 0.5845 - val_loss: 0.6797 - val_accuracy: 0.5803
Epoch 00001: val_accuracy improved from -inf to 0.58025, saving model to bilstmv1.h5
Epoch 2/3
127/127 [==============================] - 14s 112ms/step - loss: 0.6788 - accuracy: 0.5851 - val_loss: 0.6798 - val_accuracy: 0.5803
Epoch 00002: val_accuracy did not improve from 0.58025
Epoch 3/3
127/127 [==============================] - 14s 112ms/step - loss: 0.6800 - accuracy: 0.5822 - val_loss: 0.6798 - val_accuracy: 0.5803
Epoch 00003: val_accuracy did not improve from 0.58025
I have tried to change the optimzer, loss_function, and other modification. As you can expect, all the predictions are same since the loss function is not being changed.
You have an issue with your input shape in your first LSTM layer. Keras inputs takes (None, Your_Shape) as its input, since your input to the model can vary. You can have 1 input, 2 inputs, or infinity inputs. The only way to represent dynamic is by using None as the first input. The quickest way to do this is to change the input to (None, *input_shape), since the * will expand your input shape.
Your build function will then become:
def build_nn():
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True, input_shape = (None, *x_train.shape) , name="one")))
model.add(Dropout(0.20))
model.add(Bidirectional(LSTM(128, return_sequences=True , name="two")))
model.add(Dropout(0.20))
model.add(Bidirectional(LSTM(64, return_sequences=False , name="three")))
model.add(Dropout(0.20))
model.add(Dense(1,activation='sigmoid'))
# opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
opt = SGD(lr=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
return model
Though I still advise having a look at your Optimizer as that might affect your results. You can also use -1 as an input shape which will mean auto fill, but you can only use it once.

Jacobian matrix of logits with respect to image using tf.GradientTape

I am trying to find the Jacobian of logits with respect to input but I do get None and I could not figure it why.
Let'say I have a model, I trained it and saved it.
import tensorflow as tf
print("TensorFlow version: ", tf.__version__)
tf.keras.backend.set_floatx('float64')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
#Normalize the images, between 0-1
x_train, x_test = x_train / 255.0, x_test / 255.0
# Add a channels dimension
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]
print(x_train.shape)
#(60000, 28, 28, 1)
print(y_train.shape)
(60000,)
print(x_test.shape)
#(10000, 28, 28, 1)
print(y_test.shape)
#(10000,)
num_class = 10
# Convert labels to one hot encoded vectors.
y_train_oh, y_test_oh = tf.keras.utils.to_categorical(y_train, num_classes= num_class, dtype='float32'), tf.keras.utils.to_categorical(y_test, num_classes= num_class, dtype='float32')
print(y_train_oh.shape)
#(60000, 10)
print(y_test_oh.shape)
#(10000, 10)
batch_size = 32
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train_oh)).shuffle(10000).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test_oh)).batch(batch_size)
IMG_SIZE = (28, 28, 1)
input_img = tf.keras.layers.Input(shape=IMG_SIZE)
hidden_layer_1 = tf.keras.layers.Conv2D(filters = 16, kernel_size = (3, 3), strides=(1, 1), padding='same', activation=tf.nn.relu)(input_img)
hidden_layer_2 = tf.keras.layers.Conv2D(filters = 32, kernel_size = (3, 3), strides=(2, 2), padding='same', activation=tf.nn.relu)(hidden_layer_1)
hidden_layer_3 = tf.keras.layers.Conv2D(filters = 64, kernel_size = (3, 3), strides=(2, 2), padding='same', activation=tf.nn.relu)(hidden_layer_2)
flatten_layer = tf.keras.layers.Flatten()(hidden_layer_3)
output_img = tf.keras.layers.Dense(num_class)(flatten_layer)
#NO SOFTMAX LAYER IN THE END, WE WILL DO IT LATER
#predictions = tf.nn.softmax(logits)
model = tf.keras.Model(input_img, output_img)
model.summary()
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# This function accepts one-hot encoded labels
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')
#tf.function
def train_step(images, labels):
with tf.GradientTape() as tape:
# training=True is only needed if there are layers with different
# behavior during training versus inference (e.g. Dropout).
predictions = model(images, training=True)
loss = loss_object(labels, predictions)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
train_loss(loss)
train_accuracy(labels, predictions)
#tf.function
def test_step(images, labels):
# training=False is only needed if there are layers with different
# behavior during training versus inference (e.g. Dropout).
predictions = model(images, training=False)
t_loss = loss_object(labels, predictions)
test_loss(t_loss)
test_accuracy(labels, predictions)
# Train the model for 15 epochs.
num_epochs = 15
train_loss_results = []
train_accuracy_results = []
test_loss_results = []
test_accuracy_results = []
for epoch in range(num_epochs):
# Reset the metrics at the start of the next epoch
train_loss.reset_states()
train_accuracy.reset_states()
test_loss.reset_states()
test_accuracy.reset_states()
for images, labels in train_ds:
train_step(images, labels)
for test_images, test_labels in test_ds:
test_step(test_images, test_labels)
train_loss_results.append(train_loss.result())
train_accuracy_results.append(train_accuracy.result())
test_loss_results.append(test_loss.result())
test_accuracy_results.append(test_accuracy.result())
template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
print(template.format(epoch+1,
train_loss.result(),
train_accuracy.result()*100,
test_loss.result(),
test_accuracy.result()*100))
tf.keras.models.save_model(model = model, filepath = 'model.h5', overwrite=True, include_optimizer=True)
# Epoch 1, Loss: 0.1654163608489558, Accuracy: 95.22, Test Loss: 0.061988271648914496, Test Accuracy: 97.88
# Epoch 2, Loss: 0.060983153790452826, Accuracy: 98.15833333333333, Test Loss: 0.044874734015780696, Test Accuracy: 98.53
# Epoch 3, Loss: 0.042541984771347297, Accuracy: 98.69, Test Loss: 0.042536806688480366, Test Accuracy: 98.57000000000001
# Epoch 4, Loss: 0.03330485398344463, Accuracy: 98.98166666666667, Test Loss: 0.039308084282613225, Test Accuracy: 98.64
# Epoch 5, Loss: 0.024959077225852524, Accuracy: 99.205, Test Loss: 0.04370295960736327, Test Accuracy: 98.67
# Epoch 6, Loss: 0.020565333928674955, Accuracy: 99.33666666666666, Test Loss: 0.04245114839809372, Test Accuracy: 98.69
# Epoch 7, Loss: 0.01639637468442185, Accuracy: 99.47666666666667, Test Loss: 0.04561551753656099, Test Accuracy: 98.72999999999999
# Epoch 8, Loss: 0.013642370500962534, Accuracy: 99.56333333333333, Test Loss: 0.04333075060614142, Test Accuracy: 98.83
# Epoch 9, Loss: 0.010697861799085589, Accuracy: 99.655, Test Loss: 0.05918524164135248, Test Accuracy: 98.48
# Epoch 10, Loss: 0.011164671695055153, Accuracy: 99.61666666666666, Test Loss: 0.05492968221334442, Test Accuracy: 98.64
# Epoch 11, Loss: 0.008642793950046499, Accuracy: 99.69833333333334, Test Loss: 0.05367191278261649, Test Accuracy: 98.74000000000001
# Epoch 12, Loss: 0.00788155746288626, Accuracy: 99.74499999999999, Test Loss: 0.06254112380584512, Test Accuracy: 98.68
# Epoch 13, Loss: 0.006521700676742724, Accuracy: 99.77000000000001, Test Loss: 0.06381602274510409, Test Accuracy: 98.7
# Epoch 14, Loss: 0.007104389384812846, Accuracy: 99.75166666666667, Test Loss: 0.05241271737958395, Test Accuracy: 98.87
# Epoch 15, Loss: 0.006479600550850722, Accuracy: 99.77833333333334, Test Loss: 0.06816933916442823, Test Accuracy: 98.74000000000001
You can find the saved model in h5 format in this link, if you do not want to train it.
It works well so far, I can do predictions on some samples:
predictions = model(mnist_twos, training=False)
for i, logits in enumerate(predictions):
class_idx = tf.argmax(logits).numpy()
p = tf.nn.softmax(logits)[class_idx] #probabilities
print("Example {} prediction: {} ({:4.1f}%)".format(i, class_idx, 100*p))
Example 0 prediction: 2 (100.0%)
Example 1 prediction: 2 (100.0%)
Example 2 prediction: 2 (100.0%)
Example 3 prediction: 2 (100.0%)
Example 4 prediction: 2 (100.0%)
Example 5 prediction: 2 (100.0%)
Example 6 prediction: 2 (100.0%)
Example 7 prediction: 2 (100.0%)
Example 8 prediction: 2 (100.0%)
Example 9 prediction: 2 (100.0%)
What I want to do is now to find the jacobian matrix of logits with respect to input image. Since I have 10 selected images, I will have a Jacobian matrix of size (10, 28, 28, 1) since the shape of the MNIST sample is (28, 28, 1). I can do this with Tensorflow 1.0 like:
for i in range(n_class):
if i==0:
j = tf.gradients(tf.reshape(logits, (-1,))[i], X_p)
else:
j = tf.concat([j, tf.gradients(tf.reshape(logits, (-1,))[i], X_p)],axis=0)
where X_p is the placeholder for the image I am feeding in.
X_p = tf.placeholder(shape=[28, 28, 1], dtype=tf.float32)
However, I am currently using Tensorflow 2.0 and I cannot make it work using tf.GradientTape. It always ends up None. This seems to be a common problem for everyone and I followed the examples here but to no avail. Can someone help me about it?
Please check the batch_jacobian method of the GradinetTape. https://www.tensorflow.org/api_docs/python/tf/GradientTape#batch_jacobian
Convert your input to the tf variable if you are getting None gradients even after batch_jacobian.