Difference between valid accuracy from CategoricalAccuracy() and manual prediction - tensorflow

I used CategoricalAccuracy() to compute valid_accuracy during training, and it reached 0.733 in one epoch. After training, I used the model weights saved at that epoch to predict on the same validation set, and the accuracy obtained was only 0.38. Why is this?
This is my training log
I only divided the data into two sets: training set and validation set.
There are 517 images in my validation set. I use this model to make manual predictions and compare them with the true labels. Only 199 images are correctly predicted. The accuracy rate is only 0.38, which is less than 0.733.
Even if it is overfitting, I think these two values should be close.
for epoch in range(EPOCHS):
step = 0
for features in train_dataset:
step += 1
images, labels = process_features(features, data_augmentation=True)
train_step(images, labels)
print("Epoch: {}/{}, step: {}/{}, loss: {:.5f}, accuracy: {:.5f}".format(epoch,
math.ceil(train_count / BATCH_SIZE),
for features in valid_dataset:
valid_images, valid_labels = process_features(features, data_augmentation=False)
valid_step(valid_images, valid_labels)
print("Epoch: {}/{}, train loss: {:.5f}, train accuracy: {:.5f}, "
"valid loss: {:.5f}, valid accuracy: {:.5f}".format(epoch,
def process_features(features, data_augmentation):
    """Turn one parsed batch of records into model-ready tensors.

    Args:
        features: Mapping with an ``'image_raw'`` entry (encoded image bytes
            of the batch) and a ``'label'`` entry (integer class ids); both
            are read through ``.numpy()``.
        data_augmentation: Forwarded to ``load_and_preprocess_image``.

    Returns:
        ``(images, new_labels)``: the preprocessed images stacked along a new
        leading axis, and the labels one-hot encoded over 7 classes.
    """
    image_raw = features['image_raw'].numpy()
    # Every preprocessed image must actually be collected; stacking an empty
    # list fails, which is what happened when nothing was appended.
    image_tensor_list = [
        load_and_preprocess_image(image, data_augmentation=data_augmentation)
        for image in image_raw
    ]
    images = tf.stack(image_tensor_list, axis=0)
    labels = features['label'].numpy()
    new_labels = tf.one_hot(labels, 7)
    return images, new_labels
def load_and_preprocess_image(image_raw, data_augmentation=False):
    """Decode an encoded image and bring it to the network input size.

    Args:
        image_raw: Encoded image bytes accepted by ``tf.io.decode_image``.
        data_augmentation: When true, apply a random horizontal flip, a
            random crop and a random brightness shift; otherwise only resize.

    Returns:
        Float32 image tensor of size ``[IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS]``.
    """
    # decode
    image_tensor = tf.io.decode_image(contents=image_raw, channels=CHANNELS, dtype=tf.dtypes.float32)
    if data_augmentation:
        image = tf.image.random_flip_left_right(image=image_tensor)
        # Crop/pad to a canvas 20% larger than the target so the random crop
        # below has room to move.
        image = tf.image.resize_with_crop_or_pad(image=image,
                                                 target_height=int(IMAGE_HEIGHT * 1.2),
                                                 target_width=int(IMAGE_WIDTH * 1.2))
        image = tf.image.random_crop(value=image, size=[IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS])
        image = tf.image.random_brightness(image=image, max_delta=0.5)
    else:
        # The plain resize belongs to the non-augmented path only. Running it
        # unconditionally on image_tensor threw the augmented image away.
        image = tf.image.resize(image_tensor, [IMAGE_HEIGHT, IMAGE_WIDTH])
    return image
def valid_step(image_batch, label_batch):
    """Run one inference-mode forward pass and update the validation accuracy."""
    batch_predictions = model(image_batch, training=False)
    # The loss is evaluated here but not recorded anywhere within this block.
    batch_loss = loss_object(label_batch, batch_predictions)
    valid_accuracy.update_state(y_true=label_batch, y_pred=batch_predictions)
The following is my code to detect each picture of validation set
import tensorflow as tf
from configuration import save_model_dir, test_image_dir
from prepare_data import load_and_preprocess_image
from train import get_model
def get_single_picture_prediction(model, picture_dir):
    """Return the argmax over the model output for the image file at *picture_dir*."""
    raw_bytes = tf.io.read_file(filename=picture_dir)
    single_image = load_and_preprocess_image(raw_bytes, data_augmentation=False)
    # The model is called on a batch, so add a leading axis of size 1.
    batch = tf.expand_dims(single_image, axis=0)
    scores = model(batch, training=False)
    return tf.math.argmax(scores, axis=-1)
if __name__ == '__main__':
    # GPU settings
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            # Enable on-demand GPU memory growth for each visible device.
            tf.config.experimental.set_memory_growth(gpu, True)
    # load the model
    # NOTE(review): no weight-loading call is visible in this block. Confirm
    # that get_model() returns the trained network (e.g. restored from
    # save_model_dir) and not a freshly initialised one; evaluating an
    # untrained model would explain accuracy far below the training log.
    model = get_model()
    pred_class = get_single_picture_prediction(model, test_image_dir)
PyTorch and TensorFlow loss function

I have tried computing the cross-entropy loss in both TensorFlow and PyTorch (CrossEntropyLoss), but the two return different values and I don't know why. I found a solution for this problem here:
solution link
but I can't use it to fix my two models.
Please help me.
my tensorflow model feed forward neural network
model.add(keras.layers.Dense(units=256,activation="relu", use_bias=True))
model.add(keras.layers.Dense(units=128,activation="relu", use_bias=True))
model.add(keras.layers.Dense(units=64,activation="relu", use_bias=True))
# Compile the model
optimizer=tf.keras.optimizers.Adam(0.0001), # Utilize optimizer
# Train the network
history1 = model.fit(
my pytorch model feed forward neural network
input_size = 784
hidden_sizes = [256,128, 64]
output_size = 10
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
nn.Linear(hidden_sizes[0], hidden_sizes[1]),
nn.Linear(hidden_sizes[1], hidden_sizes[2]),
nn.Linear(hidden_sizes[2], output_size),
criterion = nn.CrossEntropyLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)
logps = model(images) #log probabilities
loss = criterion(logps, labels) #calculate the NLL loss
optimizer = optim.Adam(model.parameters(), lr=0.0001)
time0 = time()
epochs = 15
for e in range(epochs):
    running_loss = 0
    running_loss_val = 0
    for images, labels in trainloader:
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)
        # Clear gradients left over from the previous batch; they accumulate otherwise.
        optimizer.zero_grad()
        output = model(images)
        loss = criterion(output, labels)
        # Without backward() + step() the weights never change and the
        # training loss cannot decrease.
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Accumulate the validation loss; it was previously printed but never
    # updated, so it was always 0. No backward()/step() is called here, so
    # the weights are left untouched.
    for images, labels in valloader:
        images = images.view(images.shape[0], -1)
        running_loss_val += criterion(model(images), labels).item()
    print("Epoch {} - Training loss: {} - validation loss: {}".format(e, running_loss/len(trainloader), running_loss_val/len(valloader)))

constant loss values with normal CNNs and transfer learning

I am working on the dataset given in the paper https://arxiv.org/ftp/arxiv/papers/1511/1511.02459.pdf
In this paper, a dataset of images (portraits of people) is labeled with a floating-point number between 1 and 5 (1 ugly, 5 good looking). I wanted to work on this dataset and use MobileNetV2 with transfer learning (pretrained on ImageNet) in TensorFlow 2.4.0-dev20201009 with CUDA 11.1 on my RTX 3070 8 GB. I don't really see my mistake, but training my model often yields a constant validation loss, for example:
78/78 [==============================] - ETA: 0s - loss: 52145660442.33472020-11-20 13:19:36.796481: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:596] layout failed: Invalid argument: Size of values 2 does not match size of permutation 4 # fanin shape insequential/dense/BiasAdd-0-TransposeNHWCToNCHW-LayoutOptimizer
78/78 [==============================] - 16s 70ms/step - loss: 51654522711.5709 - val_loss: 9.5415
Epoch 2/300
78/78 [==============================] - 4s 52ms/step - loss: 9.4870 - val_loss: 9.5415
Epoch 3/300
78/78 [==============================] - 4s 52ms/step - loss: 9.3986 - val_loss: 9.5415
Epoch 4/300
78/78 [==============================] - 4s 51ms/step - loss: 9.4950 - val_loss: 9.5415
Epoch 5/300
78/78 [==============================] - 4s 52ms/step - loss: 9.4076 - val_loss: 9.5415
Epoch 6/300
78/78 [==============================] - 4s 52ms/step - loss: 9.4993 - val_loss: 9.5415
Epoch 7/300
78/78 [==============================] - 4s 52ms/step - loss: 9.3758 - val_loss: 9.5415
The validation loss would remain constant for 300 epochs. My code can be found here below. Let me summarize:
I used transfer-learning from Imagenet and froze the convolutional base of MobileNetV2.
I added a dense layer as the classifier and 1 output neuron. The loss function I used is MSE. The optimizer in the code is SGD, and I also tried ADAM, which could also yield constant loss values on the validation set.
The above error (constant val loss) also occurs with different learning rates and with ADAM. Sometimes the same learning rate yields a reasonable, non-constant val loss. I assume this is due to the random weight initialization of the dense layers in my classifier. I even tried absurd learning rates like 10, and the values are still constant. If the lr is very high, then changes should be clearly visible! This is not the case. What is wrong?
My code:
import os
from typing import Dict, Any
from PIL import Image
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras import layers
from tensorflow import keras
import matplotlib.pyplot as plt
import pickle
import numpy as np
import cv2
import random
#method to create the model
def create_model(IMG_SIZE, lr):
#Limit memore usage of GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
tf.config.experimental.set_virtual_device_configuration(gpus[0], [
except RuntimeError as e:
model = keras.Sequential()
model.add(MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False))
model.layers[0].trainable = False
model.add(layers.Dense(128, activation="relu"))
model.add(layers.Dense(1, activation="relu"))
#use adam or sgd as optimizers
adam = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.98,
sgd = tf.keras.optimizers.SGD(lr=lr, decay=1e-6, momentum=0.5)
return model
def loadImages(IMG_SIZE):
path = os.path.join(os.getcwd(), 'data\\Images')
labelMap = getLabelMap()
for img in os.listdir(path):
out_array = np.zeros((350,350, 3), np.float32) #original size of images in the dataset
img_array = cv2.imread(os.path.join(path, img))
img_array=img_array.astype('float32') #cast to float because to prevent normalization erros
out_array = cv2.normalize(img_array, out_array, 0, 1, cv2.NORM_MINMAX) #normalize image
out_array = cv2.resize(out_array, (IMG_SIZE, IMG_SIZE)) #resize, bc we need 224x224 for Imagenet pretrained weights
training_data.append([out_array, float(labelMap[img])])
except Exception as e:
return training_data
#preprocessing, the txt file All_labels.txt has lines of the form 'filename.jpg 3.2' and 3.2 is the label
def getLabelMap():
    """Read ``All_labels.txt`` and map each image file name to its label.

    Each line of the file has the form ``'filename.jpg 3.2'``. The label is
    kept as the string found in the file; callers convert it as needed.

    Returns:
        dict mapping file name to label string.
    """
    # Joining the components separately keeps the path valid on every OS,
    # not only where '\\' is the path separator.
    path = os.path.join(os.getcwd(), "data", "train_test_files", "All_labels.txt")
    # Named label_map so the builtin map() is not shadowed.
    label_map = {}
    # The context manager closes the file even if a line fails to parse.
    with open(path, "r") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # Skip blank or incomplete lines instead of raising IndexError.
                continue
            label_map[fields[0]] = fields[1]
    return label_map
#not important, in case you want to see the images after preprocessing
def showimg(image):
    # Convert the array from BGR to RGB channel order. The converted array is
    # only bound locally; no display call is visible in this block.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#pickle the preprocessed data
def pickle_it(training_set, IMG_SIZE):
    """Split (image, label) pairs into arrays and pickle them to disk.

    Args:
        training_set: Iterable of ``(features, label)`` pairs, where each
            ``features`` can be reshaped to ``(IMG_SIZE, IMG_SIZE, 3)``.
        IMG_SIZE: Side length used to reshape the image array.

    Writes ``X.pickle`` (array reshaped to ``(-1, IMG_SIZE, IMG_SIZE, 3)``)
    and ``Y.pickle`` (label array) into the current working directory.
    """
    X = []
    Y = []
    for features, label in training_set:
        # The pairs have to be collected; otherwise both pickles are empty.
        X.append(features)
        Y.append(label)
    X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    Y = np.array(Y)
    # Context managers flush and close the files deterministically.
    with open("X.pickle", "wb") as pickle_out:
        pickle.dump(X, pickle_out)
    with open("Y.pickle", "wb") as pickle_out:
        pickle.dump(Y, pickle_out)
#for prediction after training the model
def betterThan(y, Y):
    """Return the fraction of values in *Y* that are strictly greater than *y*.

    Args:
        y: Value to compare against.
        Y: Sized iterable of values.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when *Y* is empty.

    NOTE(review): predictImage reports this value as "better than X% of the
    dataset", but the count is of entries scoring *higher* than ``y``, and
    the value is a fraction rather than a percentage. Confirm which
    direction and unit are intended.
    """
    # len() rather than truthiness: Y may be a NumPy array.
    if len(Y) == 0:
        return 0.0
    # Iterate Y itself; the previous loop variable source `Z` was undefined.
    cnt = sum(1 for z in Y if z > y)
    return float(cnt / len(Y))
#for prediction after training the model
def predictImage(image, model, Y):
    """Predict the score of one image file and print how it ranks against *Y*.

    Args:
        image: Path of the image file to score.
        model: Object exposing ``predict``.
        Y: Labels passed on to ``betterThan`` for the comparison.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    img_array = cv2.imread(image)
    if img_array is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError('Could not read image: ' + str(image))
    # Use the same preprocessing as loadImages (float32 cast, min-max
    # normalisation to [0, 1], then resize), so the model sees inputs on the
    # scale it was trained on.
    img_array = img_array.astype('float32')
    img_array = cv2.normalize(img_array, None, 0, 1, cv2.NORM_MINMAX)
    img_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
    img_array = np.array(img_array).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    y = model.predict(img_array)
    per = betterThan(y, Y)
    # NOTE(review): `per` is a fraction in [0, 1], not a percentage; confirm
    # whether it should be scaled by 100 before printing.
    print('You look better than ' + str(per) + '% of the dataset')
#Main/Driver function
IMG_SIZE = 224
training_set = loadImages(IMG_SIZE)
pickle_it(training_set, IMG_SIZE) #I pickle my data, so that I don't always have to go through the preprocessing
#Load preprocessed data
X = pickle.load(open("X.pickle", "rb"))
Y = pickle.load(open("Y.pickle", "rb"))
#Just to check that the images look correct
# define the grid search parameters, feel free to edit the grids
batch_size = [64]
epochsGrid = [300]
learning_rate = [0.1]
#save models and best parameters found in grid search
size_histories = {}
min_val_loss = 10
best_para = {}
#ignore this, used for bugs on my gpu... You possibly don't need this
config = tf.compat.v1.ConfigProto(gpu_options=tf.compat.v1.GPUOptions(allow_growth=True))
sess = tf.compat.v1.Session(config=config)
#grid search, training the model
for epochs in epochsGrid:
for batch in batch_size:
for lr in learning_rate:
model = create_model(IMG_SIZE, lr)
model_name = str(epochs) + '_' + str(batch) + '_' + str(lr)
#train the model with the given hyperparameters
size_histories[model_name] = model.fit(X, Y, batch_size=batch, epochs=epochs, validation_split=0.1)
# save model with the best loss value
if min(size_histories[model_name].history['val_loss']) < min_val_loss:
min_val_loss = min(size_histories[model_name].history['val_loss'])
best_para['epoch'] = epochs
best_para['batch'] = batch
best_para['lr'] = lr
#If you want to make prediction
model = tf.keras.models.load_model("savedModel")
image = os.path.join(os.getcwd(), 'data\\otherImages\\beautifulWomen.jpg')
predictImage(image, model, Y)
I have found the issue. It is 'relu' in the output neuron. When I change my loss from RMSE to MAPE I will see that I got a 100 percent error on validation. I assume this is because all my validation data is output to 0. This is only possible when the value in the output neuron before 'relu' is negative. I don't know why this is the case. But removing 'relu' will yield better training.
Does anyone know why 'relu' causes this problem with regression problems?
If this is your last layer
model.add(layers.Dense(1, activation="relu"))
then your models final output is y if y > 0 else 0. At your untrained state, your model could very well have y pinned to something like -17 or 17 with fairly equal chance. In the case of -17, the relu will convert that to 0 and also set the gradient to 0, which means the network doesn't learn. Yeah, the network doesn't learn anything from any part of a network where a relu unit output 0. In the case of the layer before
model.add(layers.Dense(128, activation="relu"))
there will be a really good chance that about half of the units will fire with a positive value and so they learn, so that layer is fine.
What can be done in the case of a bad initialization or after training a bad state in which the output of that last layer is pushed down to below 0? Well, what if we just don't use relu. What activation to use? None! Let's look at what that would be
1: model = keras.Sequential()
2: model.add(MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False))
3: model.layers[0].trainable = False
4: model.add(layers.GlobalAveragePooling2D())
5: model.add(tf.keras.layers.Dropout(0.8))
6: model.add(layers.Dense(128, activation="relu"))
7: model.add(layers.Dense(1))
Lines 1-6 are all the same. It is important to note that the output of line 6 passes through the non-linear relu activation, and so there is the capability to learn non-linearities. Line 7, without an activation function will be a linear combination of Line 6, with a full ability to generate gradients in the positive and negative output region. When backprop is applied to learn the target values of 1 to 5, if the network outputs -17, it can learn to output a larger number. Yeah!
If you'd like to have 2 layers of nonlinearity, I'd suggest the following
1: model = keras.Sequential()
2: model.add(MobileNetV2(input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False))
3: model.layers[0].trainable = False
4: model.add(layers.GlobalAveragePooling2D())
5: model.add(layers.Dense(128, activation="tanh"))
6: model.add(layers.Dense(64, activation="tanh"))
7: model.add(layers.Dense(1))
Ditch the dropout unless you have actual proof that it helps in this very specific network (and right now I suspect you don't). Try tanh as your hidden layer activation function. It has some nice features, like being positive and negative, having a gradient even with large and/or negative numbers, and acting somewhat to automatically regularize weights. But, importantly, the last output layer has no activation function.

No variation in accuracy and loss for the CNN?

I tried to classify images from 45 classes of 700 images each using a simple CNN with two convolutional layers, with batch size: 252, epochs: 30, learning rate: 0.0001, image size: 256 by 256 by 3. I tried to increase as well as decrease the learning rate. Also, the data set was split in the ratio 0.8:0.1:0.1 for training:testing:validation. However, the accuracy and loss remain unchanged, and the loss is always zero. This is the architecture:
#The FLAGS are used to assign constant values to several paths as well as variables that will be constantly used.
flags = tf.app.flags
flags.DEFINE_float('validation_size', 0.1, 'Float: The proportion of examples in the dataset to be used for validation')
flags.DEFINE_float('test_size', 0.1, 'Float: The proportion of examples in the dataset to be used for test')
flags.DEFINE_integer('num_shards', 1, 'Int: Number of shards to split the TFRecord files into')
flags.DEFINE_integer('random_seed', 0, 'Int: Random seed to use for repeatability.')
flags.DEFINE_string('tfrecord_filename', None, 'String: The output filename to name your TFRecord file')
tf.app.flags.DEFINE_integer('target_image_height', 256, 'train input image height')
tf.app.flags.DEFINE_integer('target_image_width', 256, 'train input image width')
tf.app.flags.DEFINE_integer('batch_size', 252, 'batch size of training.')
tf.app.flags.DEFINE_integer('num_epochs', 30, 'epochs of training.')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, 'learning rate of training.')
img_size = 256
datapath_train = '//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//train//None_train_00000-of-00001.tfrecord'
datapath_validation = '//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//validation//None_validation_00000-of-00001.tfrecord'
datapath_test = '//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//test//None_test_00000-of-00001.tfrecord'
def _extract_fn(tfrecord):
'image/encoded': tf.FixedLenFeature([], tf.string),
'image/format': tf.FixedLenFeature([], tf.string),
'image/class/label': tf.FixedLenFeature([], tf.int64),
'image/height': tf.FixedLenFeature([], tf.int64),
'image/width': tf.FixedLenFeature([], tf.int64),
'image/channels': tf.FixedLenFeature([],tf.int64)
parsed_example = tf.parse_single_example(tfrecord, features)
image_de = tf.io.decode_raw(parsed_example['image/encoded'],tf.uint8)
img_height = tf.cast(parsed_example['image/height'],tf.int32)
img_width = tf.cast(parsed_example['image/width'],tf.int32)
img_channel = tf.cast(parsed_example['image/channels'],tf.int32)
img_shape = tf.stack([img_height,img_width,img_channel])
label = tf.cast(parsed_example['image/class/label'],tf.int64)
image = tf.reshape(image_de,img_shape)
#label = parsed_example['image/class/label']
return image, img_shape, label
# Pipeline of dataset and iterator
dataset = tf.data.TFRecordDataset(datapath)
# Parse the record into tensors.
dataset = dataset.map(_extract_fn)
# Generate batches
dataset = dataset.batch(1)
# Create a one-shot iterator
iterator = dataset.make_one_shot_iterator()
image, img_shape, label = iterator.get_next()
with tf.Session() as sess:
except tf.errors.OutOfRangeError:
#Layer 1
filter_size_conv1 = [5,5]
num_filters_conv1 = 32
filter_shape_pool1 = [2,2]
#Layer 2
filter_size_conv2 = [3,3]
num_filters_conv2 = 64
filter_shape_pool2 = [2,2]
x = tf.placeholder(tf.float32, shape = [None, img_size,img_size,num_channels], name='x')
y = tf.placeholder(tf.int32, shape= [None], name = 'ytrue') #Output data placeholder
y_one_hot = tf.one_hot(y,45)
y_true_cls = tf.argmax(y_one_hot, dimension=1)
def new_conv_layer(input, num_input_channels, filter_size, num_filters, name):
    """Build a stride-1, SAME-padded 2-D convolution with a per-filter bias.

    Returns:
        ``(layer, weights)``: the biased convolution output and the filter
        variable.
    """
    with tf.variable_scope(name):
        # Square filter: [filter_size, filter_size, in_channels, out_channels].
        kernel_shape = [filter_size, filter_size, num_input_channels, num_filters]
        weights = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.05))
        # One bias per output filter.
        biases = tf.Variable(tf.constant(0.05, shape=[num_filters]))
        convolved = tf.nn.conv2d(input=input, filter=weights, strides=[1, 1, 1, 1], padding='SAME')
        layer = convolved + biases
        return layer, weights
def new_pool_layer(input, name):
    """Apply 2x2 max pooling with stride 2 and SAME padding."""
    with tf.variable_scope(name):
        pooled = tf.nn.max_pool(value=input, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    return pooled
def new_relu_layer(input, name):
    """Apply the ReLU activation to *input* inside the variable scope *name*."""
    with tf.variable_scope(name):
        activated = tf.nn.relu(input)
    return activated
def new_fc_layer(input, num_inputs, num_outputs, name):
    """Build a fully connected layer computing ``input @ weights + biases``.

    No activation is applied here.
    """
    with tf.variable_scope(name):
        weight_shape = [num_inputs, num_outputs]
        weights = tf.Variable(tf.truncated_normal(weight_shape, stddev=0.05))
        biases = tf.Variable(tf.constant(0.05, shape=[num_outputs]))
        projected = tf.matmul(input, weights)
        return projected + biases
# Convolutional layer 1
layer_conv1, weights_conv1 = new_conv_layer(input=x, num_input_channels=3, filter_size=5, num_filters=32, name="conv1")
# Pooling Layer 1
layer_pool1 = new_pool_layer(layer_conv1, name="pool1")
# RelU layer 1
layer_relu1 = new_relu_layer(layer_pool1, name="relu1")
# Convolutional layer 2 consumes the activated output. Feeding layer_pool1
# here left relu1 disconnected from the rest of the graph.
layer_conv2, weights_conv2 = new_conv_layer(input=layer_relu1, num_input_channels=32, filter_size=3, num_filters=64, name="conv2")
# Pooling Layer 2
layer_pool2 = new_pool_layer(layer_conv2, name="pool2")
# RelU layer 2
layer_relu2 = new_relu_layer(layer_pool2, name="relu2")
# Flatten the activated feature map (not layer_pool2, which bypassed relu2).
num_features = layer_relu2.get_shape()[1:4].num_elements()
layer_flat = tf.reshape(layer_relu2, [-1, num_features])
# Fully connected layer 1
layer_fc1 = new_fc_layer(layer_flat, num_inputs=num_features, num_outputs=1000, name="fc1")
# RelU layer 3
layer_relu3 = new_relu_layer(layer_fc1, name="relu3")
# Fully connected layer 2: one logit per class (45 classes)
layer_fc2 = new_fc_layer(input=layer_relu3, num_inputs=1000, num_outputs=45, name="fc2")
# Use Softmax function to normalize the output
with tf.variable_scope("Softmax"):
    y_pred = tf.nn.softmax(layer_fc2)
    y_pred_cls = tf.argmax(y_pred, dimension = 1)
# Use Cross entropy cost function
with tf.name_scope("cross_ent"):
    # The labels must be the ground truth (y_one_hot), not the network's own
    # softmax output. With labels=y_pred the prediction is compared with
    # itself, so the loss carries no information about the true classes.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits = layer_fc2, labels = y_one_hot)
    cost = tf.reduce_mean(cross_entropy)
# Use Adam Optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate = FLAGS.learning_rate).minimize(cost)
# Accuracy
with tf.name_scope("accuracy"):
    # Fraction of samples whose predicted class equals the true class.
    correct_prediction = tf.equal(y_pred_cls, y_true_cls)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# setup the initialisation operator
init_op = tf.global_variables_initializer()
# Pipeline of dataset and iterator
dataset_train = tf.data.TFRecordDataset(datapath_train)
dataset_validation = tf.data.TFRecordDataset(datapath_validation)
dataset_test = tf.data.TFRecordDataset(datapath_test)
# Parse the record into tensors.
dataset_train = dataset_train.map(_extract_fn)
dataset_validation = dataset_validation.map(_extract_fn)
dataset_test = dataset_test.map(_extract_fn)
# Generate batches
dataset_train = dataset_train.batch(FLAGS.batch_size)
iterator_train = dataset_train.make_initializable_iterator()
next_element_train = iterator_train.get_next()
dataset_validation = dataset_validation.batch(FLAGS.batch_size)
iterator_validation = dataset_validation.make_initializable_iterator()
next_element_validation = iterator_validation.get_next()
dataset_test = dataset_test.batch(FLAGS.batch_size)
iterator_test = dataset_test.make_initializable_iterator()
next_element_test = iterator_test.get_next()
print('\n Starting the CNN train')
# Initialize the FileWriter
writer = tf.summary.FileWriter("Training_FileWriter/")
# create a summary for our cost and accuracy
train_cost_summary = tf.summary.scalar("train_cost", cost)
train_acc_summary = tf.summary.scalar("train_accuracy", accuracy)
test_cost_summary = tf.summary.scalar("test_cost", cost)
test_acc_summary = tf.summary.scalar("test_accuracy", accuracy)"""
with tf.Session() as sess:
    # Variables have to be initialised before the first optimizer step.
    sess.run(init_op)
    # Loop over number of epochs
    for epoch in range(FLAGS.num_epochs):
        start_time = time.time()
        # train_accuracy = 0
        # validation_accuracy = 0
        # acc_train_avg = 0
        # val_acc_avg = 0
        # Rewind the initializable iterator so that every epoch starts from
        # the beginning of the training set.
        sess.run(iterator_train.initializer)
        for batch in range(0, int(25200/FLAGS.batch_size)):
            img_train, shp_train, lbl_train = sess.run(next_element_train)
            #_, loss_train, acc_train, _train_cost_summary, _train_acc_summary = sess.run([optimizer, cost, accuracy, train_cost_summary, train_acc_summary], feed_dict = {x: img_train, y: lbl_train})
            _, loss_train, acc_train = sess.run([optimizer, cost, accuracy], feed_dict = {x: img_train, y: lbl_train})
            #writer.add_summary(_train_cost_summary, epoch +1)
            #writer.add_summary(_train_acc_summary, epoch +1)
        end_time = time.time()
        #acc_train_avg = (train_accuracy/(int(25200/FLAGS.batch_size)))
        print("Epoch "+str(epoch+1)+" completed : Time usage "+str(int(end_time-start_time))+" seconds")
        # Substitute the values into the placeholders; passing them as a
        # second print argument printed the literal "{}".
        # These are the loss/accuracy of the last batch of the epoch.
        print("\t- Training Loss:\t{}".format(loss_train))
        print("\t- Training Accuracy:\t{}".format(acc_train))
The output after training is as shown below:
Epoch 1 completed : Time usage 122 seconds
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
Epoch 2 completed : Time usage 120 seconds
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
Epoch 3 completed : Time usage 120 seconds
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
Epoch 4 completed : Time usage 120 seconds
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
The model is not learning. I have inspected the code several times and the logic seems to be OK. What could be the probable reason why these values stay constant even after changing the learning rate and the number of epochs? I have also tried generating several datasets.
You have made a mistake in cross_entropy, where you are comparing the output with itself.
# Use Cross entropy cost function
with tf.name_scope("cross_ent"):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits = layer_fc2, labels = y_pred)
Try this
# Use Cross entropy cost function
with tf.name_scope("cross_ent"):
# y_actual should be the one-hot encoded ground-truth labels (y_one_hot in the question's code)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits = layer_fc2, labels = y_actual)

L2 regularization keep increasing during training

I am fine-tuning InceptionResnetV2 in TensorFlow. During training, the regularization loss keeps increasing linearly and even becomes much larger than the cross-entropy loss in the later stages of training. I have checked the training procedure and made sure I am optimizing the cross-entropy loss and the L2 loss combined.
Can anyone explain this weird behavior a little? Any feedback is appreciated.
Here is the code and some TensorBoard plots.
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
from preprocessing import aug_parallel_v2
import numpy as np
slim = tf.contrib.slim
# total training data number
sample_num = 625020
data_path = 'iNaturalist_train.tfrecords'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log_v5'
# tensorboard visualization path
filewriter_path = './filewriter_v5_Logits'
# State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
checkpoint_save_addr = './log_v5/fine-tuning_v5.ckpt'
# State the image size you're resizing your images to. We will use the default inception size of 299.
image_size = 299
# State the number of classes to predict:
num_classes = 8142
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 5
# State your batch size
batch_size = 60
# Learning rate information and configuration
initial_learning_rate = 0.0005
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 2
# put weight on different classes inversely proportional
# to total number of their image samples
label_count = np.loadtxt('label_count.txt', dtype=int)
inverse = lambda t: 1 / t
vfunc = np.vectorize(inverse)
multiplier = vfunc(label_count)
multiplier /= np.mean(multiplier)
def run():
if not os.path.exists(log_dir):
feature = {'train/height': tf.FixedLenFeature([], tf.int64),
'train/width': tf.FixedLenFeature([], tf.int64),
'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64),
'train/sup_label': tf.FixedLenFeature([], tf.int64),
'train/aug_level': tf.FixedLenFeature([], tf.int64)}
# create a list of file names
filename_queue = tf.train.string_input_producer([data_path], num_epochs=None)
reader = tf.TFRecordReader()
_, tfrecord_serialized = reader.read(filename_queue)
features = tf.parse_single_example(tfrecord_serialized, features=feature)
# Convert the image data from string back to the numbers
height = tf.cast(features['train/height'], tf.int64)
width = tf.cast(features['train/width'], tf.int64)
# change this line for your TFrecord version
tf_image = tf.image.decode_jpeg(features['train/image'])
tf_label = tf.cast(features['train/label'], tf.int32)
aug_level = tf.cast(features['train/aug_level'], tf.int32)
# tf_sup_label = tf.cast(features['train/sup_label'], tf.int64)
tf_image = tf.reshape(tf_image, tf.stack([height, width, 3]))
tf_label = tf.reshape(tf_label, [1])
aug_level = tf.reshape(aug_level, [1])
resized_image = tf.image.resize_images(images=tf_image, size=tf.constant([400, 400]), method=2)
resized_image = tf.cast(resized_image, tf.uint8)
tf_images, tf_labels, tf_aug = tf.train.shuffle_batch([resized_image, tf_label, aug_level], batch_size=batch_size,
capacity=2048, num_threads=16, allow_smaller_final_batch=False,
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
images = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, 1])
weighted_level = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(sample_num / batch_size)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(images, num_classes=num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
print("label test")
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = tf.squeeze(tf.one_hot(labels, num_classes), [1])
weighted_onehot = tf.multiply(one_hot_labels, weighted_level)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
digits_loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_onehot, logits=logits)
reg_loss = tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
# train_vars = []
# Now we can define the optimizer that takes on the learning rate
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
# RMSProp or Adam
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer, variables_to_train=train_vars)
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.metrics.accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
tf.summary.scalar('losses/Reg_Loss', reg_loss)
tf.summary.scalar('losses/Digit_Loss', digits_loss)
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
writer = tf.summary.FileWriter(filewriter_path)
my_summary_op = tf.summary.merge_all()
def train_step(sess, train_op, global_step, imgs, lbls, weight):
    """Run one training step and log the time it took.

    The batch is fed through the ``images``, ``labels`` and
    ``weighted_level`` placeholders taken from the enclosing scope, and
    ``train_op``, ``global_step`` and ``metrics_op`` are evaluated in a
    single ``sess.run`` call.

    Args:
        sess: Session used to run the graph.
        train_op: Op whose evaluated value is logged and returned as the loss.
        global_step: Tensor evaluated to obtain the current step count.
        imgs: Batch fed to the ``images`` placeholder.
        lbls: Batch fed to the ``labels`` placeholder.
        weight: Values fed to the ``weighted_level`` placeholder.

    Returns:
        A ``(total_loss, global_step_count)`` tuple with the evaluated
        ``train_op`` and ``global_step`` values.
    """
    # Check the time for each sess run
    start_time = time.time()
    # metrics_op is run only for its side effect; its value is discarded.
    total_loss, global_step_count, _ = sess.run(
        [train_op, global_step, metrics_op],
        feed_dict={images: imgs, labels: lbls, weighted_level: weight})
    time_elapsed = time.time() - start_time

    # Run the logging to print some results
    # NOTE(review): the label says digit_loss, but the value is whatever
    # train_op evaluates to (it is created from total_loss) - confirm.
    logging.info('global step %s: digit_loss: %.4f (%.2f sec/step)',
                 global_step_count, total_loss, time_elapsed)

    return total_loss, global_step_count
saver_pretrain = tf.train.Saver(variables_to_restore)
saver_train = tf.train.Saver(train_vars)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
# Create a coordinator and run all QueueRunner objects
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
saver_pretrain.restore(sess, checkpoint_file)
start_time = time.time()
for step in range(int(num_steps_per_epoch * num_epochs)):
imgs, lbls, augs = sess.run([tf_images, tf_labels, tf_aug])
imgs, lbls = aug_parallel_v2(imgs, lbls, augs)
imgs = imgs[:, 50:349, 50:349, :]
imgs = 2*(imgs.astype(np.float32)) - 1
lbls = lbls.astype(np.int32)
weight = multiplier[lbls]
weight = np.array(weight).reshape((batch_size, 1))
# print(imgs[0, 0:10, 0:10, 0:2])
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 10 step.
if step % 20 == 0:
loss, global_step_count = train_step(sess, train_op, global_step, imgs, lbls, weight)
summaries = sess.run(my_summary_op, feed_dict={images: imgs, labels: lbls, weighted_level: weight})
writer.add_summary(summaries, global_step_count)
# sess.summary_computed(sess, summaries)
# If not, simply run the training step
loss, _ = train_step(sess, train_op, global_step, imgs, lbls, weight)
if step % 2000 == 0:
logging.info('Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
print('one batch time: ', time.time() - start_time)
start_time = time.time()
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
# Stop the threads
# Wait for threads to stop
if __name__ == '__main__':
I am new here, and don't have enough reputation to post images.
Here are two links for the accuracy plot and losses plot. You can easily tell the regularization loss is in a dominant position.
This is a difficult question to answer. I can give some pointers though.
In general, when you try to minimize digits_loss, that is, to fit your model to your data, you will slowly change the weights in your layers. To counter potential overfitting, an L2 regularization loss (the sum of the squares of all weights, reg_loss in your code) is generally added to the overall loss (total_loss in your code). These two forces generally act against each other, and if the balance is right, you train a good model.
In your case you're taking a network (resnet_v2) that was developed for 1,001 classes and trying to predict 8,142 classes. There is no problem with that per se, but you're upsetting the balance. So I believe you need to override the default weight decay of 0.00004 for resnet v2 with some higher value, in this line (note only 3 zeros after the decimal point, for a 10x increase):
with slim.arg_scope( inception_resnet_v2_arg_scope( weight_decay = 0.0004 ) ):
A higher weight_decay parameter will force the L2 loss to decrease faster. The problem is that this number is just a guess, I have no idea what an ideal value would be. You need to experiment with multiple values and figure it out.

No classification done after passing an image to the model in Tensorflow

I am trying to pass an image to the model that I created by following the 2_fullyconnected.ipynb Udacity assignment.
The code in which I created the model is shown below.
# In[1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
# First reload the data we generated in `1_notmnist.ipynb`.
# In[2]:
# Pickle file holding the train/validation/test splits.
pickle_file = 'notMNIST.pickle'

# NOTE: pickle.load can execute arbitrary code; only load a file you
# generated yourself.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
# Reformat into a shape that's more adapted to the models we're going to train:
# - data as a flat matrix,
# - labels as float 1-hot encodings.
# In[3]:
image_size = 28  # each image is flattened to image_size * image_size values
num_labels = 10  # length of every one-hot label vector


def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels.

    Args:
        dataset: Array-like of images; reshaped to
            ``(-1, image_size * image_size)`` and cast to float32.
        labels: 1-D array-like of integer class ids.

    Returns:
        A ``(dataset, labels)`` tuple of float32 arrays: the flattened
        images and a ``(len(labels), num_labels)`` one-hot matrix.
    """
    dataset = np.asarray(dataset).reshape(
        (-1, image_size * image_size)).astype(np.float32)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(num_labels)
              == np.asarray(labels)[:, None]).astype(np.float32)
    return dataset, labels
# Replace every split with its reformatted version (see reformat()).
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the resulting shapes, one line per split.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)
#stochastic gradient descent training
# In[7]:
batch_size = 128

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables. The explicit names are what a Saver stores them under.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]),
        name="weights")
    biases = tf.Variable(tf.zeros([num_labels]), name="biases")

    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    # Keyword arguments are required here: TensorFlow >= 1.0 rejects
    # positional arguments to softmax_cross_entropy_with_logits.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=tf_train_labels, logits=logits))

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(
        tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
# In[9]:
def accuracy(predictions, labels):
    """Return the percentage of rows where prediction and label agree.

    Args:
        predictions: 2-D array-like of per-class scores, one row per sample.
        labels: 2-D array-like of the same shape (e.g. one-hot rows).

    Returns:
        ``100 * matches / number_of_rows``, where a row matches when the
        arg-max along axis 1 is the same in both inputs.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])
# Let's run it:
# In[10]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    # Variables must be initialized before the first training step; without
    # this, session.run fails on uninitialized weights and biases.
    tf.global_variables_initializer().run()
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    # Save while the session is still open so the trained values are written.
    save_path = tf.train.Saver().save(session, "/tmp/important_model/model.ckpt")
    print("Model saved in file: %s" % save_path)
The model is saved in /tmp/important_model/.
Tree structure for that folder is as follows:
|-- checkpoint
|-- model.ckpt
`-- model.ckpt.meta
Now I am creating a new file in which I am trying to restore my model and then pass an image to the model for classification.
I have created the graph in the new Python file as well, which is necessary for restoring the model (I think; I could be wrong, so please correct me if I am).
# In[16]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
from scipy import ndimage
# In[17]:
image_size = 28
num_labels = 10
# In[25]:
# A single image is classified at a time.
batch_size = 1
graph = tf.Graph()
with graph.as_default():
    # Variables. They are created with the same names and shapes as in the
    # training script; the random/zero initial values are placeholders that
    # are presumably overwritten when the checkpoint is restored - confirm.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]),
        name="weights")
    biases = tf.Variable(tf.zeros([num_labels]), name="biases")

    # Input: one flattened image, fed at run time.
    tf_valid_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size * image_size))
    # Softmax over the linear model's output for the fed image.
    valid_prediction = tf.nn.softmax(
        tf.matmul(tf_valid_dataset, weights) + biases)
# In[26]:
def accuracy(predictions, labels):
    """Return the percentage of rows where prediction and label agree.

    Args:
        predictions: 2-D array-like of per-class scores, one row per sample.
        labels: 2-D array-like of the same shape (e.g. one-hot rows).

    Returns:
        ``100 * matches / number_of_rows``, where a row matches when the
        arg-max along axis 1 is the same in both inputs.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])
# In[34]:
# Float literal on purpose: this file imports only print_function from
# __future__, so under Python 2 `255 / 2` would truncate to 127.
pixel_depth = 255.0
# Centre and scale the pixel values.
# NOTE(review): presumably the same normalization that was applied to the
# pickled training data - confirm against 1_notmnist.ipynb.
image_data = (ndimage.imread('notMNIST_small/A/QXJyaWJhQXJyaWJhU3RkLm90Zg==.png').astype(float) -
              pixel_depth / 2) / pixel_depth
print(image_data.shape)
# Flatten to one row so it matches the (batch_size, 784) placeholder.
resized_data = image_data.reshape((-1, 784))
print(resized_data.shape)
with tf.Session(graph=graph) as session:
    # Load the trained weights and biases saved by the training script.
    tf.train.Saver().restore(session, "/tmp/important_model/model.ckpt")
    print("Model restored.")
When I execute In[34] in this IPython notebook, the output is:
(28, 28)
(1, 784)
Model restored
I want to report the 5 most probable labels for the given image, but I don't know how to do it. The above program doesn't show any error, but it doesn't show the desired output either. I thought I would get the probabilities of the image belonging to each class, since I passed my image through the tf.nn.softmax function, but unfortunately I am not getting anything.
Any help would be appreciated.
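The line in your code that defines valid_prediction (the tf.nn.softmax(...) call) computes a probability distribution across the possible output labels for each image in your data set (in this case a single image). However, it only defines the computation; to get the values you have to run it in the session.
The result of session.run() is a NumPy array of shape (1, 10). To see the probabilities, you can simply print the array: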
# Evaluate the softmax output for the fed image; this has to run inside the
# open session, after the checkpoint has been restored.
result = session.run(valid_prediction,feed_dict={tf_valid_dataset:resized_data})
There are many ways that you can get the top k predictions for your image. One of the easiest is to use TensorFlow's tf.nn.top_k() operator when defining your graph:
# Graph definition: class probabilities for the fed image.
valid_logits = tf.matmul(tf_valid_dataset, weights) + biases
valid_prediction = tf.nn.softmax(valid_logits)
# The five largest probabilities per row, together with their class indices.
top_5_labels = tf.nn.top_k(valid_prediction, k=5)
# ...
# Inside the session: evaluate the top-5 op for the prepared image.
result = session.run(top_5_labels, feed_dict={tf_valid_dataset: resized_data})