constant loss values with normal CNNs and transfer learning - tensorflow

I am working on the dataset given in the paper https://arxiv.org/ftp/arxiv/papers/1511/1511.02459.pdf
In this paper, a dataset of images (portraits of people) is labeled with a floating-point number between 1 and 5 (1 = ugly, 5 = good looking). I wanted to work on this dataset and use MobileNetV2 with transfer learning (pretrained on ImageNet) in TensorFlow 2.4.0-dev20201009 with CUDA 11.1 on my RTX 3070 8 GB. I don't see my mistake, but training my model often results in a constant validation loss, for example:
78/78 [==============================] - ETA: 0s - loss: 52145660442.33472020-11-20 13:19:36.796481: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:596] layout failed: Invalid argument: Size of values 2 does not match size of permutation 4 # fanin shape insequential/dense/BiasAdd-0-TransposeNHWCToNCHW-LayoutOptimizer
78/78 [==============================] - 16s 70ms/step - loss: 51654522711.5709 - val_loss: 9.5415
Epoch 2/300
78/78 [==============================] - 4s 52ms/step - loss: 9.4870 - val_loss: 9.5415
Epoch 3/300
78/78 [==============================] - 4s 52ms/step - loss: 9.3986 - val_loss: 9.5415
Epoch 4/300
78/78 [==============================] - 4s 51ms/step - loss: 9.4950 - val_loss: 9.5415
Epoch 5/300
78/78 [==============================] - 4s 52ms/step - loss: 9.4076 - val_loss: 9.5415
Epoch 6/300
78/78 [==============================] - 4s 52ms/step - loss: 9.4993 - val_loss: 9.5415
Epoch 7/300
78/78 [==============================] - 4s 52ms/step - loss: 9.3758 - val_loss: 9.5415
...
The validation loss remains constant for all 300 epochs. My code can be found below. Let me summarize:
I used transfer-learning from Imagenet and froze the convolutional base of MobileNetV2.
I added a dense layer as the classifier head and a single output neuron. The loss function I used is MSE. The optimizer in the code is SGD; I also tried Adam, which can likewise yield constant loss values on the validation set.
The above problem (constant val loss) also occurs with different learning rates and with Adam. Sometimes the same learning rate gives a non-constant, reasonable val loss instead; I assume this is due to the random weight initialization of the dense layers in my classifier. I even tried absurd learning rates like 10, and the values are still constant. With a learning rate that high, changes should clearly be visible, but they are not. What is wrong?
My code:
import os
from typing import Dict, Any
from PIL import Image
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras import layers
from tensorflow import keras
import matplotlib.pyplot as plt
import pickle
import numpy as np
import cv2
import random
#method to create the model
def create_model(IMG_SIZE, lr):
    #Limit memory usage of the GPU
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            tf.config.experimental.set_virtual_device_configuration(gpus[0], [
                tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*7)])
        except RuntimeError as e:
            print(e)
    model = keras.Sequential()
    model.add(MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False))
    model.layers[0].trainable = False
    model.add(layers.GlobalAveragePooling2D())
    model.add(tf.keras.layers.Dropout(0.8))
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dense(1, activation="relu"))
    #use adam or sgd as the optimizer
    adam = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.98,
                                    epsilon=1e-9)
    sgd = tf.keras.optimizers.SGD(lr=lr, decay=1e-6, momentum=0.5)
    model.compile(optimizer=sgd,
                  loss=tf.losses.mean_squared_error,
                  )
    model.summary()
    return model
#preprocessing
def loadImages(IMG_SIZE):
    path = os.path.join(os.getcwd(), 'data\\Images')
    training_data = []
    labelMap = getLabelMap()
    for img in os.listdir(path):
        out_array = np.zeros((350, 350, 3), np.float32)  #original size of the images in the dataset
        try:
            img_array = cv2.imread(os.path.join(path, img))
            img_array = img_array.astype('float32')  #cast to float to prevent normalization errors
            out_array = cv2.normalize(img_array, out_array, 0, 1, cv2.NORM_MINMAX)  #normalize the image to [0, 1]
            out_array = cv2.resize(out_array, (IMG_SIZE, IMG_SIZE))  #resize, because we need 224x224 for the ImageNet pretrained weights
            training_data.append([out_array, float(labelMap[img])])
        except Exception as e:
            pass
    return training_data
#preprocessing, the txt file All_labels.txt has lines of the form 'filename.jpg 3.2' and 3.2 is the label
def getLabelMap():
    map = {}
    path = os.getcwd()
    path = os.path.join(path, "data\\train_test_files\\All_labels.txt")
    f = open(path, "r")
    for line in f:
        line = line.split()
        map[line[0]] = line[1]
    f.close()
    return map
#not important, in case you want to see the images after preprocessing
def showimg(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image)
    plt.show()
#pickle the preprocessed data
def pickle_it(training_set, IMG_SIZE):
    X = []
    Y = []
    for features, label in training_set:
        X.append(features)
        Y.append(label)
    X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    Y = np.array(Y)
    pickle_out = open("X.pickle", "wb")
    pickle.dump(X, pickle_out)
    pickle_out.close()
    pickle_out = open("Y.pickle", "wb")
    pickle.dump(Y, pickle_out)
    pickle_out.close()
#for prediction after training the model
def betterThan(y, Y):
    Z = np.sort(Y)
    cnt = 0
    for z in Z:
        if z > y:
            break
        else:
            cnt = cnt + 1
    return float(cnt/len(Y))
#for prediction after training the model
def predictImage(image, model, Y):
    img_array = cv2.imread(image)
    img_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
    img_array = np.array(img_array).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    y = model.predict(img_array)
    per = betterThan(y, Y)
    print('You look better than ' + str(per) + '% of the dataset')
#Main/Driver function
#Preprocessing
IMG_SIZE = 224
training_set=[]
training_set = loadImages(IMG_SIZE)
random.shuffle(training_set)
pickle_it(training_set, IMG_SIZE) #I pickle my data, so that I don't always have to go through the preprocessing
#Load preprocessed data
X = pickle.load(open("X.pickle", "rb"))
Y = pickle.load(open("Y.pickle", "rb"))
#Just to check that the images look correct
showimg(X[0])
# define the grid search parameters, feel free to edit the grids
batch_size = [64]
epochsGrid = [300]
learning_rate = [0.1]
#save models and best parameters found in grid search
size_histories = {}
min_val_loss = 10
best_para = {}
#ignore this, used for bugs on my gpu... You possibly don't need this
config = tf.compat.v1.ConfigProto(gpu_options=tf.compat.v1.GPUOptions(allow_growth=True))
sess = tf.compat.v1.Session(config=config)
#grid search, training the model
for epochs in epochsGrid:
    for batch in batch_size:
        for lr in learning_rate:
            model = create_model(IMG_SIZE, lr)
            model_name = str(epochs) + '_' + str(batch) + '_' + str(lr)
            #train the model with the given hyperparameters
            size_histories[model_name] = model.fit(X, Y, batch_size=batch, epochs=epochs, validation_split=0.1)
            # save the model with the best loss value
            if min(size_histories[model_name].history['val_loss']) < min_val_loss:
                min_val_loss = min(size_histories[model_name].history['val_loss'])
                best_para['epoch'] = epochs
                best_para['batch'] = batch
                best_para['lr'] = lr
                model.save('savedModel')
#If you want to make prediction
model = tf.keras.models.load_model("savedModel")
image = os.path.join(os.getcwd(), 'data\\otherImages\\beautifulWomen.jpg')
predictImage(image, model, Y)
EDIT:
I have found the issue: it is the 'relu' activation on the output neuron. When I change my loss from MSE to MAPE, I see that I get a 100 percent error on validation. I assume this is because all of my validation data is mapped to an output of 0, which is only possible when the value in the output neuron before the 'relu' is negative. I don't know why this is the case, but removing the 'relu' yields better training.
Does anyone know why 'relu' causes this problem with regression problems?

If this is your last layer
model.add(layers.Dense(1, activation="relu"))
then your model's final output is y if y > 0, else 0. In your untrained state, the model could very well have y pinned to something like -17 or 17 with fairly equal chance. In the case of -17, the relu converts that to 0 and also sets the gradient to 0, which means the network doesn't learn. Indeed, the network doesn't learn anything through any part of the network where a relu unit outputs 0. In the case of the layer before,
model.add(layers.Dense(128, activation="relu"))
there is a really good chance that about half of the units fire with a positive value and so they learn; that layer is fine.
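To make the zero-gradient point about the output neuron concrete, here is a tiny standalone sketch (my own illustration, not the asker's model) showing that an MSE loss gives no gradient through a relu output neuron whose pre-activation is negative:
import tensorflow as tf

# one relu "output neuron" whose pre-activation is negative
w = tf.Variable([[2.0]])
x = tf.constant([[-3.0]])           # x @ w = -6, so relu(-6) = 0
y_true = tf.constant([[4.0]])       # a target in the 1-5 range

with tf.GradientTape() as tape:
    y_pred = tf.nn.relu(tf.matmul(x, w))                # output is 0
    loss = tf.reduce_mean(tf.square(y_true - y_pred))   # MSE, as in the question

print(tape.gradient(loss, w))       # tf.Tensor([[0.]]) -- no learning signal at all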
What can be done in the case of a bad initialization, or a bad state reached during training, in which the output of that last layer is pushed below 0? Well, what if we just don't use relu there? What activation should we use instead? None! Let's look at what that would be:
1: model = keras.Sequential()
2: model.add(MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False))
3: model.layers[0].trainable = False
4: model.add(layers.GlobalAveragePooling2D())
5: model.add(tf.keras.layers.Dropout(0.8))
6: model.add(layers.Dense(128, activation="relu"))
7: model.add(layers.Dense(1))
Lines 1-6 are all the same. It is important to note that the output of line 6 passes through the non-linear relu activation, and so there is the capability to learn non-linearities. Line 7, without an activation function, is a linear combination of line 6, with full ability to generate gradients in both the positive and negative output regions. When backprop is applied to learn the target values of 1 to 5, if the network outputs -17, it can learn to output a larger number. Yeah!
If you'd like to have 2 layers of nonlinearity, I'd suggest the following
1: model = keras.Sequential()
2: model.add(MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False))
3: model.layers[0].trainable = False
4: model.add(layers.GlobalAveragePooling2D())
5: model.add(layers.Dense(128, activation="tanh"))
6: model.add(layers.Dense(64, activation="tanh"))
7: model.add(layers.Dense(1))
Ditch the dropout unless you have actual proof that it helps in this very specific network (and right now I suspect you don't). Try tanh as your hidden-layer activation function. It has some nice features: it is both positive and negative, it has a gradient even for large and/or negative inputs, and it acts somewhat as an automatic regularizer on the weights. But, importantly, the last output layer has no activation function.
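For completeness, here is a minimal sketch (my own, reusing the names from the question's code, with untuned placeholder hyperparameters) of how this regression head could be compiled and trained with a linear output and MSE:
model = keras.Sequential([
    MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False),
    layers.GlobalAveragePooling2D(),
    layers.Dense(128, activation="tanh"),
    layers.Dense(64, activation="tanh"),
    layers.Dense(1),                              # linear output for regression
])
model.layers[0].trainable = False                 # keep the convolutional base frozen
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss="mse")
# model.fit(X, Y, batch_size=64, epochs=..., validation_split=0.1)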

Related

eager mode and keras.fit have different results

I am trying to convert model.fit() in Keras to eager-mode training. The model is an autoencoder: it has one encoder and two decoders, and the decoders have different loss functions. The loss functions for the decoders are the same in the eager-mode code and in the model.fit() code. I tried to set everything up exactly as in model.fit(), but the resulting losses are different. I would really appreciate some help.
The link for google colab: https://colab.research.google.com/drive/1XNOwJ9oVgs1z9qqXIs_ldnKuSm3Dn2Ud?usp=sharing
In the following, the definition and training of the model are shown. I use model.fit() for training. At the end, the output with the loss values is shown.
def fit_ae(x_unlab, p_m, alpha, parameters):
    # Parameters
    _, dim = x_unlab.shape
    epochs = parameters['epochs']
    batch_size = parameters['batch_size']
    # Build model
    inputs = contrib_layers.Input(shape=(dim,))
    # Encoder
    h = contrib_layers.Dense(int(256), activation='relu', name='encoder1')(inputs)
    h = contrib_layers.Dense(int(128), activation='relu', name='encoder2')(h)
    h = contrib_layers.Dense(int(26), activation='relu', name='encoder3')(h)
    # Mask estimator
    output_1 = contrib_layers.Dense(dim, activation='sigmoid', name='mask')(h)
    # Feature estimator
    output_2 = contrib_layers.Dense(dim, activation='sigmoid', name='feature')(h)
    # Projection Network
    model = Model(inputs=inputs, outputs=[output_1, output_2])
    model.compile(optimizer='rmsprop',
                  loss={'mask': 'binary_crossentropy',
                        'feature': 'mean_squared_error'},
                  loss_weights={'mask': 1, 'feature': alpha})
    m_unlab = mask_generator(p_m, x_unlab)
    m_label, x_tilde = pretext_generator(m_unlab, x_unlab)
    # Fit model on unlabeled data
    model.fit(x_tilde, {'mask': m_label, 'feature': x_unlab}, epochs=epochs, batch_size=batch_size)
########### OUTPUT
Epoch 1/15
4/4 [==============================] - 1s 32ms/step - loss: 1.0894 - mask_loss: 0.6560 - feature_loss: 0.2167
Epoch 2/15
4/4 [==============================] - 0s 23ms/step - loss: 0.6923 - mask_loss: 0.4336 - feature_loss: 0.1293
Epoch 3/15
4/4 [==============================] - 0s 26ms/step - loss: 0.4720 - mask_loss: 0.3022 - feature_loss: 0.0849
Epoch 4/15
4/4 [==============================] - 0s 23ms/step - loss: 0.4054 - mask_loss: 0.2581 - feature_loss: 0.0736
In the following code, I implemented the above model in eager mode. I set the optimizer and loss functions exactly as in the above code. The training data are the same for both models.
###################################################### MODEL AUTOENCODER ============================================
def eager_ae(x_unlab, p_m, alpha, parameters):
    # import pdb; pdb.set_trace()
    _, dim = x_unlab.shape
    epochs = parameters['epochs']
    batch_size = parameters['batch_size']
    E = keras.Sequential([
        Input(shape=[dim,]),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(26, activation='relu'),
    ])
    # Mask estimator
    output_1 = keras.Sequential([
        Dense(dim, activation='sigmoid'),
    ])
    # Feature estimator
    output_2 = keras.Sequential([
        Dense(dim, activation='sigmoid'),
    ])
    optimizer = tf.keras.optimizers.RMSprop()
    loss_mask = tf.keras.losses.BinaryCrossentropy()
    loss_feature = tf.keras.losses.MeanSquaredError()
    # Generate corrupted samples
    m_unlab = mask_generator(p_m, x_unlab)
    m_label, x_tilde = pretext_generator(m_unlab, x_unlab)
    for epoch in range(epochs):
        loss_metric = tf.keras.metrics.Mean(name='train_loss')
        len_batch = range(int(x_unlab.shape[0]/batch_size))
        for i in len_batch:
            samples = x_tilde[i*batch_size:(i+1)*batch_size]
            mask = m_label[i*batch_size:(i+1)*batch_size]
            # train_step(samples, tgt)
            with tf.GradientTape() as tape:
                latent = E(samples, training=True)
                out_mask = output_1(latent)
                out_feat = output_2(latent)
                # import pdb; pdb.set_trace()
                lm = loss_mask(out_mask, tf.Variable(mask, dtype=tf.float32))
                lf = loss_feature(out_feat, tf.Variable(samples, dtype=tf.float32))
                pred_loss = lm + alpha*lf
            trainable_vars = E.trainable_weights + output_1.trainable_weights + output_2.trainable_weights
            grads = tape.gradient(pred_loss, trainable_vars)
            optimizer.apply_gradients(zip(grads, trainable_vars))
            loss_metric.update_state(pred_loss)
        print(f'Epoch {epoch}, Loss {loss_metric.result()}')
    return E
############# OUTPUT
Epoch 0, Loss 7.902271747589111
Epoch 1, Loss 5.336598873138428
Epoch 2, Loss 2.880791664123535
Epoch 3, Loss 1.9296690225601196
Epoch 4, Loss 1.6377944946289062
Epoch 5, Loss 1.5342860221862793
Epoch 6, Loss 1.5015968084335327
Epoch 7, Loss 1.4912563562393188
The total loss in the first code is well below one (≈0.25), while the total loss in the second code is more than 1 (≈1.3). I cannot find the issue in my second implementation (the second code).

Tensorflow 2: How to fit a subclassed model that returns multiple values in the call method?

I built the following model via Model Subclassing in TensorFlow 2:
from tensorflow.keras import Model, Input
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.layers import Flatten, Dense
class Detector(Model):
    def __init__(self, num_classes=3, name="DenseNet201"):
        super(Detector, self).__init__(name=name)
        self.feature_extractor = DenseNet201(
            include_top=False,
            weights="imagenet",
        )
        self.feature_extractor.trainable = False
        self.flatten_layer = Flatten()
        self.prediction_layer = Dense(num_classes, activation=None)

    def call(self, inputs):
        x = preprocess_input(inputs)
        extracted_feature = self.feature_extractor(x, training=False)
        x = self.flatten_layer(extracted_feature)
        y_hat = self.prediction_layer(x)
        return extracted_feature, y_hat
The subsequent steps are compiling and fitting the model. The model compiled as normal, but when fitting on my image generator (built from ImageDataGenerator), I encountered the error: InvalidArgumentError: Incompatible shapes: [64,18,18] vs. [64,1] [[node Equal (defined at :19) ]] [Op:__inference_train_function_32187] Function call stack: train_function.
history = detector.fit(
    train_generator,
    epochs=1,
    validation_data=val_generator,
    callbacks=callbacks
)
This is obvious, because TensorFlow does not know whether the prediction is y_hat or extracted_feature during detector.fit() and thus throws an error. So, what is the right implementation of detector.fit() for my case?
Following this question-answer1, you should first train your model with (let's say) one input and one output. Later, if you want to compute grad-cam, you would pick some intermediate layer of your base model (not the final output of the base model), and in that case you need to build your feature extractor separately. For example:
# (let's say: one input and one output)
# used for training
base_model = keras.application(...)
x = base_model(...)
dense_drop_bn_[whatever] = x
out = dense_drop_bn_[whatever]
model = Model(base_model.input, out)
# inference / we need to compute grad-cam
new_model = tf.keras.models.Model(model.input,
                                  [model.layers[15].output, model.output])
In the above, model is used for training, and later, at inference time, if you need to compute grad-cam based on, for example, layer number 15, you build new_model with the appropriate outputs. Hope this makes things clear. For more information about feature extraction, see the official doc, Extract and reuse nodes in the graph of layers2. FYI, the exact same thing is happening here as I told you earlier. Also, check this official code example; you will see the exact same thing there.
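A concrete (and simplified) version of that sketch could look like the following; the head, the input size, and the choice of exposed layer are assumptions for illustration, not the asker's exact architecture:
import tensorflow as tf
from tensorflow import keras

base_model = keras.applications.DenseNet201(include_top=False,
                                            weights="imagenet",
                                            input_shape=(224, 224, 3))
base_model.trainable = False

# single-input / single-output model used for training
x = keras.layers.GlobalAveragePooling2D()(base_model.output)
out = keras.layers.Dense(3, activation="softmax")(x)
model = keras.Model(base_model.input, out)

# at inference time, expose a feature map alongside the prediction, e.g. the
# final feature map of the base model (or any layer chosen via base_model.get_layer(...))
new_model = keras.Model(model.input, [base_model.output, model.output])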
However, there is another way that I think might work easily for you. Since you're using a custom model, we can take advantage of the training argument of the call() method. Normally this is True at training time and False at inference time, so based on it we can return the desired outputs accordingly. Here is the complete code example:
import tensorflow as tf

# get some data
data_dir = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

datagen_kwargs = dict(rescale=1./255, validation_split=.20)
dataflow_kwargs = dict(target_size=(64, 64),
                       batch_size=16,
                       interpolation="bilinear")
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=40,
    horizontal_flip=True,
    width_shift_range=0.2, height_shift_range=0.2,
    shear_range=0.2, zoom_range=0.2,
    **datagen_kwargs)
train_generator = train_datagen.flow_from_directory(
    data_dir, subset="training", shuffle=True, **dataflow_kwargs)

for image, label in train_generator:
    print(image.shape, image.dtype)
    print(label.shape, label.dtype)
    print(label[:4])
    break
(16, 64, 64, 3) float32
(16, 5) float32
[[0. 0. 0. 0. 1.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1.]]
Here we do that trick based on the boolean value of training in the call method.
class Detector(Model):
    def __init__(self, num_classes=5, name="DenseNet201"):
        super(Detector, self).__init__(name=name)
        self.feature_extractor = DenseNet201(
            include_top=False,
            weights="imagenet",
        )
        self.feature_extractor.trainable = False
        self.flatten_layer = Flatten()
        self.prediction_layer = Dense(num_classes, activation='softmax')

    def call(self, inputs, training):
        x = preprocess_input(inputs)
        extracted_feature = self.feature_extractor(x, training=False)
        x = self.flatten_layer(extracted_feature)
        y_hat = self.prediction_layer(x)
        if training:
            return y_hat
        else:
            return [y_hat, extracted_feature]
Train
det = Detector()
det.compile(loss='categorical_crossentropy',
            optimizer='adam', metrics=['acc'])
train_step = train_generator.samples // train_generator.batch_size
det.fit(train_generator,
        steps_per_epoch=train_step,
        validation_data=train_generator,
        validation_steps=train_step,
        epochs=2, verbose=2)
Epoch 1/2
37s 139ms/step - loss: 1.7543 - acc: 0.2650 - val_loss: 1.5310 - val_acc: 0.3764
Epoch 2/2
21s 115ms/step - loss: 1.4913 - acc: 0.3915 - val_loss: 1.3066 - val_acc: 0.4667
<tensorflow.python.keras.callbacks.History at 0x7fa2890b1790>
Evaluate
det.evaluate(train_generator,
             steps=train_step)
4s 76ms/step - loss: 1.3066 - acc: 0.4667
[1.3065541982650757, 0.46666666865348816]
Inference
Here, we get two outputs from the model (unlike the single output we got at training time).
y_hat, base_feature = det.predict(train_generator,
                                  steps=train_step)
y_hat.shape, base_feature.shape
((720, 5), (720, 2, 2, 1920))
Now you can do grad-cam or whatever else requires such feature maps.
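For example, a rough grad-cam style sketch using those two inference-time outputs might look like the following (my own addition, loosely following the keras.io Grad-CAM recipe; not part of the answer's training code):
images, _ = next(train_generator)                  # one batch from the generator
img = tf.convert_to_tensor(images[:1])

with tf.GradientTape() as tape:
    preds, feature_map = det(img, training=False)  # inference branch: [y_hat, features]
    class_channel = preds[:, tf.argmax(preds[0])]  # score of the top predicted class

grads = tape.gradient(class_channel, feature_map)           # d(score) / d(feature map)
weights = tf.reduce_mean(grads, axis=(1, 2))                # pool gradients per channel
cam = tf.nn.relu(tf.einsum('bhwc,bc->bhw', feature_map, weights))
cam = cam / (tf.reduce_max(cam) + 1e-8)                     # normalized (1, 2, 2) heat map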

Selecting loss and metrics for Tensorflow model

I'm trying to do transfer learning, using a pretrained Xception model with a newly added classifier.
This is the model:
base_model = keras.applications.Xception(
    weights="imagenet",
    input_shape=(224, 224, 3),
    include_top=False
)
The dataset I'm using is oxford_flowers102 taken directly from tensorflow datasets.
This is a dataset page.
I have a problem with selecting some parameters - either the training accuracy shows suspiciously low values, or there's an error.
I need help with specifying these parameters for the oxford_flowers102 dataset:
The newly added dense layer for the classifier. I tried:
outputs = keras.layers.Dense(102, activation='softmax')(x) and I'm not sure whether I should use an activation function here or not.
The loss function for the model.
The metrics.
I tried:
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.Accuracy()],
)
I'm not sure whether it should be SparseCategoricalCrossentropy or CategoricalCrossentropy, and what about the from_logits parameter?
I'm also not sure whether I should choose keras.metrics.Accuracy() or keras.metrics.CategoricalAccuracy() for the metrics.
I am definitely lacking some theoretical knowledge, but right now I just need this to work. Looking forward to your answers!
About the data set: oxford_flowers102
The dataset is divided into a training set, a validation set, and a test set. The training set and validation set each consist of 10 images per class (totaling 1020 images each). The test set consists of the remaining 6149 images (minimum 20 per class).
'test' 6,149
'train' 1,020
'validation' 1,020
If we check, we'll see
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

data, ds_info = tfds.load('oxford_flowers102',
                          with_info=True, as_supervised=True)
train_ds, valid_ds, test_ds = data['train'], data['validation'], data['test']

for i, data in enumerate(train_ds.take(3)):
    print(i+1, data[0].shape, data[1])
1 (500, 667, 3) tf.Tensor(72, shape=(), dtype=int64)
2 (500, 666, 3) tf.Tensor(84, shape=(), dtype=int64)
3 (670, 500, 3) tf.Tensor(70, shape=(), dtype=int64)
ds_info.features["label"].num_classes
102
So, it has 102 categories or classes, and the target comes as an integer, with inputs of varying shapes.
Clarification
First, if you keep this integer target or label, you should use sparse_categorical_accuracy for accuracy and sparse_categorical_crossentropy for the loss function. But if you transform your integer labels into one-hot encoded vectors, then you should use categorical_accuracy for accuracy and categorical_crossentropy for the loss function. As this dataset has integer labels, you can choose sparse_categorical, or you can transform the labels to one-hot in order to use categorical; both pairings are sketched right below.
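As a quick sketch of the two pairings (model here stands for the compiled Xception-based classifier from the question):
# either: integer labels of shape (N,) -> use the "sparse" loss / metric
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])

# or: one-hot labels of shape (N, 102) -> use the "categorical" loss / metric
# (e.g. labels_onehot = tf.one_hot(labels_int, depth=102))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['categorical_accuracy'])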
Second, if you set outputs = keras.layers.Dense(102, activation='softmax')(x) as the last layer, you will get probability scores. But if you set outputs = keras.layers.Dense(102)(x), then you will get logits. So, if you set activation='softmax', you should not use from_logits=True. For example, in your above code you should do as follows (here's some theory for you):
...
(a)
# Use softmax activation (no logits output)
outputs = keras.layers.Dense(102, activation='softmax')(x)
...
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[keras.metrics.Accuracy()],
)
or,
(b)
# no activation, output will be logits
outputs = keras.layers.Dense(102)(x)
...
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.Accuracy()],
)
Third, keras uses string identifiers such as metrics=['acc'] and optimizer='adam'. But in your case you need to be a bit more specific, as you specify the loss function explicitly. So, instead of keras.metrics.Accuracy(), you should choose keras.metrics.SparseCategoricalAccuracy() if your targets are integers, or keras.metrics.CategoricalAccuracy() if your targets are one-hot encoded vectors; a minimal compile call along these lines is shown below.
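Put together, a minimal compile call for the logits variant with integer labels would be something like this sketch:
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)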
Code Examples
Here is an end-to-end example. Note that I will transform the integer labels into one-hot encoded vectors (right now, it's a matter of preference for me). Also, I want probabilities (not logits) from the last layer, which means from_logits=False. For all of this, I need to choose the following parameters for training:
# use softmax to get probabilities
outputs = keras.layers.Dense(102,
                             activation='softmax')(x)
# so no logits, set it to False (FYI, it is already False by default)
loss = keras.losses.CategoricalCrossentropy(from_logits=False)
# specify the metrics properly
metrics = keras.metrics.CategoricalAccuracy()
Let's complete the whole code.
import tensorflow_datasets as tfds
tfds.disable_progress_bar()
data, ds_info = tfds.load('oxford_flowers102',
with_info=True, as_supervised=True)
train_ds, valid_ds, test_ds = data['train'], data['validation'], data['test']
NUM_CLASSES = ds_info.features["label"].num_classes
train_size = len(data['train'])
batch_size = 64
img_size = 120
Preprocess and Augmentation
import tensorflow as tf
# pre-process functions
def normalize_resize(image, label):
    image = tf.cast(image, tf.float32)
    image = tf.divide(image, 255)
    image = tf.image.resize(image, (img_size, img_size))
    label = tf.one_hot(label, depth=NUM_CLASSES)  # int to one-hot
    return image, label

# augmentation
def augment(image, label):
    image = tf.image.random_flip_left_right(image)
    return image, label

train = train_ds.map(normalize_resize).cache().map(augment).shuffle(100).\
        batch(batch_size).repeat()
valid = valid_ds.map(normalize_resize).cache().batch(batch_size)
test = test_ds.map(normalize_resize).cache().batch(batch_size)
Model
from tensorflow import keras
base_model = keras.applications.Xception(
    weights='imagenet',
    input_shape=(img_size, img_size, 3),
    include_top=False)
base_model.trainable = False
inputs = keras.Input(shape=(img_size, img_size, 3))
x = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(NUM_CLASSES, activation='softmax')(x)
model = keras.Model(inputs, outputs)
Okay, additionally, here I like to use two metrics to compute top-1 and top-3 accuracy.
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.CategoricalCrossentropy(),
              metrics=[
                  keras.metrics.TopKCategoricalAccuracy(k=3, name='acc_top3'),
                  keras.metrics.TopKCategoricalAccuracy(k=1, name='acc_top1')
              ])
model.fit(train, steps_per_epoch=train_size // batch_size,
          epochs=20, validation_data=valid, verbose=2)
...
Epoch 19/20
15/15 - 2s - loss: 0.2808 - acc_top3: 0.9979 - acc_top1: 0.9917 -
val_loss: 1.5025 - val_acc_top3: 0.8147 - val_acc_top1: 0.6186
Epoch 20/20
15/15 - 2s - loss: 0.2743 - acc_top3: 0.9990 - acc_top1: 0.9885 -
val_loss: 1.4948 - val_acc_top3: 0.8147 - val_acc_top1: 0.6255
Evaluate
# evaluate on test set
model.evaluate(test, verbose=2)
97/97 - 18s - loss: 1.6482 - acc_top3: 0.7733 - acc_top1: 0.5994
[1.648208498954773, 0.7732964754104614, 0.5994470715522766]

Tensorflow Probability returns unstable Predictions

I'm using a TensorFlow Probability model. Of course the outcome is probabilistic, and the derivative of the error does not go to zero (otherwise the model would be deterministic). The prediction is not stable, because the derivative of the loss lies in a range, say, in a convex optimization, from 1.2 to 0.2 as an example.
This interval generates a different prediction each time the model is trained. Sometimes I get an excellent fit (red = real, blue lines = predicted +2 and -2 standard deviations):
Sometimes not, with same hyper-parameters:
Sometimes mirrored:
For business purposes, this is quite problematic, given that it is expected that a prediction presents a stable output.
Here is the code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
np.random.seed(42)
dataframe = pd.read_csv('Apple_Data_300.csv').ix[0:800,:]
dataframe.head()
plt.plot(range(0,dataframe.shape[0]),dataframe.iloc[:,1])
x1=np.array(dataframe.iloc[:,1]+np.random.randn(dataframe.shape[0])).astype(np.float32).reshape(-1,1)
y=np.array(dataframe.iloc[:,1]).T.astype(np.float32).reshape(-1,1)
tfd = tfp.distributions
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, kernel_initializer='glorot_uniform'),
    tfp.layers.DistributionLambda(lambda t: tfd.Normal(loc=t, scale=1)),
    tfp.layers.DistributionLambda(lambda t: tfd.Normal(loc=t, scale=1)),
    tfp.layers.DistributionLambda(lambda t: tfd.Normal(loc=t, scale=1))
])
negloglik = lambda x, rv_x: -rv_x.log_prob(x)
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001), loss=negloglik)
model.fit(x1,y, epochs=500, verbose=True)
yhat = model(x1)
mean = yhat.mean()
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    mm = sess.run(mean)
    mean = yhat.mean()
    stddev = yhat.stddev()
    mean_plus_2_std = sess.run(mean - 2. * stddev)
    mean_minus_2_std = sess.run(mean + 2. * stddev)
plt.figure(figsize=(8,6))
plt.plot(y,color='red',linewidth=1)
#plt.plot(mm)
plt.plot(mean_minus_2_std,color='blue',linewidth=1)
plt.plot(mean_plus_2_std,color='blue',linewidth=1)
Loss:
Epoch 498/500
801/801 [==============================] - 0s 32us/sample - loss: 2.4169
Epoch 499/500
801/801 [==============================] - 0s 30us/sample - loss: 2.4078
Epoch 500/500
801/801 [==============================] - 0s 31us/sample - loss: 2.3944
Is there a way to control the prediction output for a probabilistic model? The loss stops at 1.42, even when decreasing the learning rate and increasing the number of training epochs. What am I missing here?
WORKING CODE AFTER ANSWER:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(1, kernel_initializer='glorot_uniform'),
        tfp.layers.DistributionLambda(lambda t: tfd.Normal(loc=t, scale=1))
    ])
    negloglik = lambda x, rv_x: -rv_x.log_prob(x)
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001), loss=negloglik)
    model.fit(x1, y, epochs=500, verbose=True, batch_size=16)
    yhat = model(x1)
    mean = yhat.mean()
    sess.run(init)
    mm = sess.run(mean)
    mean = yhat.mean()
    stddev = yhat.stddev()
    mean_plus_2_std = sess.run(mean - 3. * stddev)
    mean_minus_2_std = sess.run(mean + 3. * stddev)
Are you running tf.global_variables_initializer too late?
I found this in the answer of Understanding tf.global_variables_initializer:
Variable initializers must be run explicitly before other ops in your model can be run. The easiest way to do that is to add an op that runs all the variable initializers, and run that op before using the model.
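In the TF1-style graph mode used in the question, that ordering would look roughly like this sketch (assuming the model-building code from the question has already run):
# build the graph first: model, yhat = model(x1), mean = yhat.mean(), ...
init = tf.global_variables_initializer()   # one op that runs all variable initializers
with tf.Session() as sess:
    sess.run(init)                          # run it before anything that reads the variables
    mm = sess.run(mean)                     # only then evaluate tensors that depend on them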

Keras validation accuracy much lower than training accuracy even with the same dataset for both training and validation

We tried transfer learning with the Keras ResNet50 application (TensorFlow as the backend) on our own dataset of 2000 classes, with 14000 images as the training set and 5261 images as the validation set. The training results we got differ a lot in both loss and accuracy between training and validation. Then we tried to use the same images for both training and validation, i.e. trained with 14000 images and validated with the same 14000 images; the training results for that attempt are similar, i.e. high training accuracy and low validation accuracy.
Keras version: 2.1.6
Tensorflow version: 1.8.0
Code (same dataset for both training and validation) is below:
from __future__ import print_function
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.preprocessing.image import ImageDataGenerator
from datetime import datetime
from keras.optimizers import SGD
import numpy as np
batch_size = 28 # tweak to your GPUs capacity
img_height = 224 # ResNetInceptionv2 & Xception like 299, ResNet50 & VGG like 224
img_width = img_height
channels = 3
input_shape = (img_height, img_width, channels)
best_model = 'best_model.h5'
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
train_generator = train_datagen.flow_from_directory(
    'data/train',  # this is the target directory
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical')
classes = len(train_generator.class_indices)
n_of_train_samples = train_generator.samples
callbacks = [ModelCheckpoint(filepath=best_model, verbose=0, save_best_only=True),
             EarlyStopping(monitor='val_acc', patience=3, verbose=0)]
base_model = ResNet50(input_shape=input_shape, weights='imagenet', include_top=False)
# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional ResNet50 layers
for layer in base_model.layers:
    layer.trainable = False
pool_layer = [layer for layer in base_model.layers if layer.name == 'avg_pool'][0]
base_model = Model(base_model.input, pool_layer.input)
base_model.layers.pop()
dropout=[.25,.25]
dense=1024
last = base_model.output
a = MaxPooling2D(pool_size=(7,7),name='maxpool')(last)
b = AveragePooling2D(pool_size=(7,7),name='avgpool')(last)
x = concatenate([a,b], axis = 1)
x = Flatten()(x)
x = Dense(dense, init='uniform', activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(dropout[0])(x)
x = Dense(classes, activation='softmax')(x)
model = Model(base_model.input, outputs=x)
print("Start time: %s" % str(datetime.now()))
# compile the model (should be done *after* setting layers to non-trainable)
model.compile(optimizer=SGD(lr=1e-2, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
# train the model on the new data for a few epochs
model.fit_generator(
    train_generator,
    steps_per_epoch=n_of_train_samples//batch_size,
    epochs=3,
    validation_data=train_generator,
    validation_steps=n_of_train_samples//batch_size,
    callbacks=callbacks)
print("End time: %s" % str(datetime.now()))
Training result as below
Found 14306 images belonging to 2000 classes.
Start time: 2018-05-21 10:51:34.459545
Epoch 1/3
510/510 [==============================] - 10459s 21s/step - loss: 5.6433 - acc: 0.1538 - val_loss: 9.8465 - val_acc: 0.0024
Epoch 2/3
510/510 [==============================] - 10258s 20s/step - loss: 1.3632 - acc: 0.8550 - val_loss: 10.3264 - val_acc: 0.0044
Epoch 3/3
510/510 [==============================] - 63640s 125s/step - loss: 0.2367 - acc: 0.9886 - val_loss: 10.4537 - val_acc: 0.0034
End time: 2018-05-22 10:17:42.028052
We understood that we shouldn't use the same dataset for both training and validation, but we just could not understand why Keras gives us such large differences in both loss and accuracy between training and validation when the dataset is the same for both.
P.S. We tried the same dataset, i.e. 2000 classes with 14000 images as the training set and 5261 images as the validation set, with the fast.ai library's ResNet50, and there the training loss and validation loss are not much different. Code and results with the fast.ai library are below.
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from datetime import datetime
PATH = "data/"
sz=224
arch=resnet50
bs=28
tfms = tfms_from_model(arch, sz)
data = ImageClassifierData.from_paths(PATH, tfms=tfms, bs=bs)
learn = ConvLearner.pretrained(arch, data, precompute=False)
print("Start time: %s" % str(datetime.now()))
learn.fit(1e-2, 5)
print("End time: %s" % str(datetime.now()))
Start time: 2018-05-02 18:08:51.644750
0%| | 1/487 [00:14<2:00:00, 14.81s/it, loss=tensor(7.5704)]
[0. 6.13229 5.2504 0.26458]
[1. 3.70098 2.74378 0.6752 ]
[2. 1.80197 1.08414 0.88106]
[3. 0.83221 0.50391 0.9424 ]
[4. 0.45565 0.31056 0.95554]
End time: 2018-05-03 00:27:13.147758
Not an answer, but a suggestion to see the non-affected loss/metrics per batch:
def batchEnd(batch, logs):
    print("\nfinished batch " + str(batch) + ": " + str(logs) + "\n")

metricCallback = LambdaCallback(on_batch_end=batchEnd)
callbacks = [metricCallback,
             ModelCheckpoint(filepath=best_model, verbose=0, save_best_only=True),
             EarlyStopping(monitor='val_acc', patience=3, verbose=0)]
With this, you will see the metrics for each batch without the influence of other batches (assuming Keras does some kind of averaging/totaling when it shows the metrics for an epoch).
Each time you start fitting, it can give different results, because the initial weights are loaded differently (in the multi-threaded environment of the library). And if you have an imbalanced dataset, it is also hard to reason about the correctness of the results. Besides, I always believe that a minimum of 50-100 epochs is needed to get a fairly reliable result (3 is not sufficient).