No variation in accuracy and loss for the CNN? - tensorflow

I am trying to classify images from 45 classes with 700 images each using a simple two-layer CNN, with batch size 252, 30 epochs, learning rate 0.0001, and image size 256 x 256 x 3. I have tried both increasing and decreasing the learning rate. The dataset was split in the ratio 0.8:0.1:0.1 for training:testing:validation. However, the accuracy and loss remain unchanged, and the loss is always zero. This is the architecture:
import time

import matplotlib.pyplot as plt
import tensorflow as tf

#The FLAGS are used to assign constant values to several paths as well as variables that will be constantly used.
flags = tf.app.flags
flags.DEFINE_string('dataset_dir','//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//','//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//')
flags.DEFINE_float('validation_size', 0.1, 'Float: The proportion of examples in the dataset to be used for validation')
flags.DEFINE_float('test_size', 0.1, 'Float: The proportion of examples in the dataset to be used for test')
flags.DEFINE_integer('num_shards', 1, 'Int: Number of shards to split the TFRecord files into')
flags.DEFINE_integer('random_seed', 0, 'Int: Random seed to use for repeatability.')
flags.DEFINE_string('tfrecord_filename', None, 'String: The output filename to name your TFRecord file')
tf.app.flags.DEFINE_integer('target_image_height', 256, 'train input image height')
tf.app.flags.DEFINE_integer('target_image_width', 256, 'train input image width')
tf.app.flags.DEFINE_integer('batch_size', 252, 'batch size of training.')
tf.app.flags.DEFINE_integer('num_epochs', 30, 'epochs of training.')
tf.app.flags.DEFINE_float('learning_rate', 0.0001, 'learning rate of training.')
FLAGS = flags.FLAGS
img_size = 256
num_channels=3
num_classes=45
########################################################################################################################
########################################################################################################################
datapath_train = '//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//train//None_train_00000-of-00001.tfrecord'
datapath_validation = '//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//validation//None_validation_00000-of-00001.tfrecord'
datapath_test = '//media//datapart//akshara//NWPU-RESISC45//NWPU-RESISC45//test//None_test_00000-of-00001.tfrecord'
def _extract_fn(tfrecord):
    features = {
        'image/encoded': tf.FixedLenFeature([], tf.string),
        'image/format': tf.FixedLenFeature([], tf.string),
        'image/class/label': tf.FixedLenFeature([], tf.int64),
        'image/height': tf.FixedLenFeature([], tf.int64),
        'image/width': tf.FixedLenFeature([], tf.int64),
        'image/channels': tf.FixedLenFeature([], tf.int64)
    }
    parsed_example = tf.parse_single_example(tfrecord, features)
    image_de = tf.io.decode_raw(parsed_example['image/encoded'], tf.uint8)
    img_height = tf.cast(parsed_example['image/height'], tf.int32)
    img_width = tf.cast(parsed_example['image/width'], tf.int32)
    img_channel = tf.cast(parsed_example['image/channels'], tf.int32)
    img_shape = tf.stack([img_height, img_width, img_channel])
    label = tf.cast(parsed_example['image/class/label'], tf.int64)
    image = tf.reshape(image_de, img_shape)
    #label = parsed_example['image/class/label']
    return image, img_shape, label
########################################################################################################################
#########################################################################################################################
"""
# Pipeline of dataset and iterator
dataset = tf.data.TFRecordDataset(datapath)
# Parse the record into tensors.
dataset = dataset.map(_extract_fn)
# Generate batches
dataset = dataset.batch(1)
# Create a one-shot iterator
iterator = dataset.make_one_shot_iterator()
image, img_shape, label = iterator.get_next()
with tf.Session() as sess:
    try:
        print(sess.run(img_shape))
        image_batch = sess.run(image)
        print(image_batch)
        img_bas = tf.cast(image_batch, tf.uint8)
        plt.imshow(image_batch[0, :, :, :]*255)
        plt.show()
    except tf.errors.OutOfRangeError:
        pass"""
########################################################################################################################
########################################################################################################################
#INITIALIZATION FOR THE CNN ARCHITECTURE
#Layer 1
filter_size_conv1 = [5,5]
num_filters_conv1 = 32
filter_shape_pool1 = [2,2]
#Layer 2
filter_size_conv2 = [3,3]
num_filters_conv2 = 64
filter_shape_pool2 = [2,2]
#Placeholders
x = tf.placeholder(tf.float32, shape = [None, img_size,img_size,num_channels], name='x')
y = tf.placeholder(tf.int32, shape= [None], name = 'ytrue') #Output data placeholder
y_one_hot = tf.one_hot(y,45)
y_true_cls = tf.argmax(y_one_hot, dimension=1)
########################################################################################################################
########################################################################################################################
def new_conv_layer(input, num_input_channels, filter_size, num_filters, name):
    with tf.variable_scope(name) as scope:
        # Shape of the filter-weights for the convolution
        shape = [filter_size, filter_size, num_input_channels, num_filters]
        # Create new weights (filters) with the given shape
        weights = tf.Variable(tf.truncated_normal(shape, stddev=0.05))
        # Create new biases, one for each filter
        biases = tf.Variable(tf.constant(0.05, shape=[num_filters]))
        # TensorFlow operation for convolution
        layer = tf.nn.conv2d(input=input, filter=weights, strides=[1, 1, 1, 1], padding='SAME')
        # Add the biases to the results of the convolution.
        layer += biases
        return layer, weights

def new_pool_layer(input, name):
    with tf.variable_scope(name) as scope:
        # TensorFlow operation for max pooling
        layer = tf.nn.max_pool(value=input, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        return layer

def new_relu_layer(input, name):
    with tf.variable_scope(name) as scope:
        # TensorFlow operation for ReLU
        layer = tf.nn.relu(input)
        return layer

def new_fc_layer(input, num_inputs, num_outputs, name):
    with tf.variable_scope(name) as scope:
        # Create new weights and biases.
        weights = tf.Variable(tf.truncated_normal([num_inputs, num_outputs], stddev=0.05))
        biases = tf.Variable(tf.constant(0.05, shape=[num_outputs]))
        # Multiply the input and weights, and then add the bias-values.
        layer = tf.matmul(input, weights) + biases
        return layer
# CONVOLUTIONAL LAYER 1
layer_conv1, weights_conv1 = new_conv_layer(input=x, num_input_channels=3, filter_size=5, num_filters=32, name ="conv1")
# Pooling Layer 1
layer_pool1 = new_pool_layer(layer_conv1, name="pool1")
# RelU layer 1
layer_relu1 = new_relu_layer(layer_pool1, name="relu1")
# CONVOLUTIONAL LAYER 2
layer_conv2, weights_conv2 = new_conv_layer(input=layer_pool1, num_input_channels=32, filter_size=3, num_filters=64, name= "conv2")
# Pooling Layer 2
layer_pool2 = new_pool_layer(layer_conv2, name="pool2")
# RelU layer 2
layer_relu2 = new_relu_layer(layer_pool2, name="relu2")
# FLATTENED LAYER
num_features = layer_relu2.get_shape()[1:4].num_elements()
layer_flat = tf.reshape(layer_pool2, [-1, num_features])
# FULLY-CONNECTED LAYER 1
layer_fc1 = new_fc_layer(layer_flat, num_inputs=num_features, num_outputs=1000, name="fc1")
# RelU layer 3
layer_relu3 = new_relu_layer(layer_fc1, name="relu3")
# FULLY-CONNECTED LAYER 2
layer_fc2 = new_fc_layer(input=layer_relu3, num_inputs=1000, num_outputs=45, name="fc2")
# Use Softmax function to normalize the output
with tf.variable_scope("Softmax"):
y_pred = tf.nn.softmax(layer_fc2)
y_pred_cls = tf.argmax(y_pred, dimension = 1)
# Use Cross entropy cost function
with tf.name_scope("cross_ent"):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits = layer_fc2, labels = y_pred)
cost = tf.reduce_mean(cross_entropy)
# Use Adam Optimizer
with tf.name_scope("optimizer"):
optimizer = tf.train.AdamOptimizer(learning_rate = FLAGS.learning_rate).minimize(cost)
# Accuracy
with tf.name_scope("accuracy"):
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# setup the initialisation operator
init_op = tf.global_variables_initializer()
# Pipeline of dataset and iterator
dataset_train = tf.data.TFRecordDataset(datapath_train)
dataset_validation = tf.data.TFRecordDataset(datapath_validation)
dataset_test = tf.data.TFRecordDataset(datapath_test)
# Parse the record into tensors.
dataset_train = dataset_train.map(_extract_fn)
dataset_validation = dataset_validation.map(_extract_fn)
dataset_test = dataset_test.map(_extract_fn)
# Generate batches
dataset_train = dataset_train.batch(FLAGS.batch_size)
iterator_train = dataset_train.make_initializable_iterator()
next_element_train = iterator_train.get_next()
dataset_validation = dataset_validation.batch(FLAGS.batch_size)
iterator_validation = dataset_validation.make_initializable_iterator()
next_element_validation = iterator_validation.get_next()
dataset_test = dataset_test.batch(FLAGS.batch_size)
iterator_test = dataset_test.make_initializable_iterator()
next_element_test = iterator_test.get_next()
print('\n Starting the CNN train')
# Initialize the FileWriter
writer = tf.summary.FileWriter("Training_FileWriter/")
"""
# create a summary for our cost and accuracy
train_cost_summary = tf.summary.scalar("train_cost", cost)
train_acc_summary = tf.summary.scalar("train_accuracy", accuracy)
test_cost_summary = tf.summary.scalar("test_cost", cost)
test_acc_summary = tf.summary.scalar("test_accuracy", accuracy)"""
#PERFORM THE CNN OPERATIONS
with tf.Session() as sess:
    sess.run(init_op)
    sess.run(iterator_test.initializer)
    # Add the model graph to TensorBoard
    writer.add_graph(sess.graph)
    # Loop over number of epochs
    print('\nTraining')
    for epoch in range(FLAGS.num_epochs):
        sess.run(iterator_train.initializer)
        sess.run(iterator_validation.initializer)
        start_time = time.time()
        """train_accuracy = 0
        validation_accuracy = 0
        acc_train_avg = 0
        val_acc_avg = 0"""
        for batch in range(0, int(25200/FLAGS.batch_size)):
            img_train, shp_train, lbl_train = sess.run(next_element_train)
            #_, loss_train, acc_train, _train_cost_summary, _train_acc_summary = sess.run([optimizer, cost, accuracy, train_cost_summary, train_acc_summary], feed_dict = {x: img_train, y: lbl_train})
            _, loss_train, acc_train = sess.run([optimizer, cost, accuracy], feed_dict={x: img_train, y: lbl_train})
            #train_accuracy += acc_train
            #writer.add_summary(_train_cost_summary, epoch + 1)
            #writer.add_summary(_train_acc_summary, epoch + 1)
        end_time = time.time()
        #acc_train_avg = (train_accuracy/(int(25200/FLAGS.batch_size)))
        #TRAINING
        print("Epoch "+str(epoch+1)+" completed : Time usage "+str(int(end_time-start_time))+" seconds")
        print("\tAccuracy:")
        print("\t- Training Loss:\t{}", loss_train)
        print("\t- Training Accuracy:\t{}", acc_train)
The output after training is as shown below:
Training
Epoch 1 completed : Time usage 122 seconds
Accuracy:
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
Validation
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
Epoch 2 completed : Time usage 120 seconds
Accuracy:
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
Validation
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
Epoch 3 completed : Time usage 120 seconds
Accuracy:
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
Validation
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
Epoch 4 completed : Time usage 120 seconds
Accuracy:
- Training Loss: {} 0.0
- Training Accuracy: {} 0.035714287
Validation
- Validation Accuracy: {} 0.035714287
Validation Loss: {} 0.0
The model is not learning at all. I have inspected the code several times and the logic seems fine. What could be the reason the metrics stay constant even after changing the learning rate and the number of epochs? I have also tried generating several different datasets.

You have made a mistake in cross_entropy: you are comparing the output with itself (labels=y_pred), so the loss never reflects the true labels.
# Use Cross entropy cost function
with tf.name_scope("cross_ent"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer_fc2, labels=y_pred)
Try this instead:
# Use Cross entropy cost function
with tf.name_scope("cross_ent"):
    # labels should be the one-hot encoded ground truth (y_one_hot in your code)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=layer_fc2, labels=y_one_hot)
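As a side note (not part of the fix itself), since y in your graph already holds integer class indices, the sparse variant accepts them directly and the one-hot step can be skipped; a minimal sketch:
# Sketch: sparse cross entropy takes integer labels of shape [batch] directly.
with tf.name_scope("cross_ent"):
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y, logits=layer_fc2)
    cost = tf.reduce_mean(cross_entropy)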

Related

PyTorch and TensorFlow loss function

I have tried computing the cross-entropy loss in both TensorFlow and PyTorch, but they return different values and I don't know why. I found a solution for this problem
solution link
but I can't fix my two models.
Please help me.
My TensorFlow feed-forward neural network model:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential()
model.add(keras.layers.Input(shape=x_train[0].shape))
model.add(keras.layers.Dense(units=256,activation="relu", use_bias=True))
model.add(keras.layers.Dense(units=128,activation="relu", use_bias=True))
model.add(keras.layers.Dense(units=64,activation="relu", use_bias=True))
model.add(keras.layers.Dense(units=10,activation="softmax"))
epochs=15
# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.0001),  # Utilize optimizer
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'])
# Train the network
history1 = model.fit(
    x_train,
    y_train,
    batch_size=64,
    validation_split=0.1,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs/image")])
My PyTorch feed-forward neural network model:
import torch
from torch import nn, optim
from time import time

input_size = 784
hidden_sizes = [256,128, 64]
output_size = 10
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], hidden_sizes[2]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[2], output_size),
                      nn.Softmax())
criterion = nn.CrossEntropyLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)
logps = model(images) #log probabilities
loss = criterion(logps, labels) #calculate the NLL loss
optimizer = optim.Adam(model.parameters(), lr=0.0001)
time0 = time()
epochs = 15
for e in range(epochs):
    running_loss = 0
    running_loss_val = 0
    for images, labels in trainloader:
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)
        optimizer.zero_grad()
        output = model(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        print("Epoch {} - Training loss: {} - validation loss: {}".format(e, running_loss/len(trainloader), running_loss_val/len(valloader)))

Stateful LSTM VAE: Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [batch_size, latent_dim]

I am solving a time-series problem using an LSTM VAE (variational auto-encoder). I have built my VAE model as below:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
from tensorflow.keras.layers import (Dense, Dropout, Input, Lambda, LeakyReLU,
                                     LSTM, RepeatVector, TimeDistributed)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

tf.compat.v1.disable_eager_execution()
class VAE:
    def __init__(self,
                 hidden_layer_units,
                 hidden_layer_leakyrelu_alphas,
                 hidden_layer_dropout_rates,
                 batch_size,
                 time_steps,
                 num_features,
                 is_stateful_learning):
        self.hidden_layer_units = hidden_layer_units
        self.hidden_layer_leakyrelu_alphas = hidden_layer_leakyrelu_alphas
        self.hidden_layer_dropout_rates = hidden_layer_dropout_rates
        self.encoder_num_layers = 0
        self.latent_space_dim = 0
        vae_total_layers = len(hidden_layer_units)
        if 0 < vae_total_layers:
            self.encoder_num_layers = int((vae_total_layers - 1) / 2)
            self.latent_space_dim = self.hidden_layer_units[self.encoder_num_layers]
        self.batch_size = batch_size
        self.time_steps = time_steps
        self.num_features = num_features
        self.is_stateful_learning = is_stateful_learning
        self.encoder = None
        self.decoder = None
        self.model = None
        self.model_input = None
        self.model_output = None
        self.mu = None
        self.log_variance = None
        self.kulback_coef = 0.0001
        self._build()

    def summary(self):
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()

    def compile(self, learning_rate=0.001):
        optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer,
                           loss=self._calculate_combined_loss,
                           metrics=[self._calculate_reconstruction_loss, self._calculate_kl_loss])

    def _build(self):
        self._build_encoder()
        self._build_decoder()
        self._build_autoencoder()

    def _build_encoder(self):
        encoder_input = self._add_encoder_input()
        lstm_layers = self._add_encoder_lstm_layers(encoder_input)
        bottleneck = self._add_bottleneck(lstm_layers)
        self.model_input = encoder_input
        self.encoder = Model(encoder_input, bottleneck, name="encoder")

    def _build_decoder(self):
        decoder_input = self._add_decoder_input()
        repeater_layer = self._add_repeater_layer(decoder_input)
        lstm_layer = self._add_decoder_lstm_layer(repeater_layer)
        decoder_output = self._add_decoder_output(lstm_layer)
        self.decoder = Model(decoder_input, decoder_output, name="decoder")

    def _build_autoencoder(self):
        model_input = self.model_input
        encoder_output = self.encoder(model_input)
        model_output = self.decoder(encoder_output)
        self.model_output = model_output
        self.model = Model(model_input, model_output, name="autoencoder")

    def _add_encoder_input(self):
        if self.is_stateful_learning:
            x = Input(batch_shape=(self.batch_size, self.time_steps, self.num_features), name="encoder_input")
        else:
            x = Input(shape=(self.time_steps, self.num_features), name="encoder_input")
        return x

    def _add_encoder_lstm_layers(self, encoder_input):
        """ Create all lstm layers in encoder."""
        x = encoder_input
        for layer_index, units in enumerate(self.hidden_layer_units[:self.encoder_num_layers]):
            lstm_params = {}
            if layer_index < self.encoder_num_layers - 1:
                lstm_params["return_sequences"] = True
            if self.is_stateful_learning:
                lstm_params["stateful"] = True
            x = LSTM(units=units, **lstm_params)(x)
            x = LeakyReLU(alpha=self.hidden_layer_leakyrelu_alphas[layer_index])(x)
            x = Dropout(rate=self.hidden_layer_dropout_rates[layer_index])(x)
        return x

    def _add_bottleneck(self, x):
        """ add bottleneck with Guassian sampling (Dense layer)."""
        self.mu = Dense(self.latent_space_dim, name="mu")(x)
        self.log_variance = Dense(self.latent_space_dim, name="log_variance")(x)
        x = Lambda(self.sample_point_from_normal_distribution, name="encoder_output")([self.mu, self.log_variance])
        return x

    def sample_point_from_normal_distribution(self, args):
        mu, log_variance = args
        epsilon = K.random_normal(shape=K.shape(mu), mean=0., stddev=1.)
        sampled_point = mu + K.exp(log_variance / 2) * epsilon
        return sampled_point

    def _add_decoder_input(self):
        if self.is_stateful_learning:
            x = Input(batch_shape=(self.batch_size, self.latent_space_dim), name="decoder_input")
        else:
            x = Input(shape=(self.latent_space_dim), name="decoder_input")
        return x

    def _add_repeater_layer(self, decoder_input):
        return RepeatVector(self.time_steps)(decoder_input)

    def _add_decoder_lstm_layer(self, repeater_layer):
        x = repeater_layer
        for layer_index, units in enumerate(self.hidden_layer_units[self.encoder_num_layers + 1:]):
            lstm_params = {}
            if self.is_stateful_learning:
                # stateful build
                lstm_params = {'stateful': True, 'return_sequences': True}
            else:
                lstm_params["return_sequences"] = True
            layer_no = layer_index + self.encoder_num_layers + 1
            x = LSTM(units=units, **lstm_params)(x)
            x = LeakyReLU(alpha=self.hidden_layer_leakyrelu_alphas[layer_no])(x)
            x = Dropout(rate=self.hidden_layer_dropout_rates[layer_no])(x)
        return x

    def _add_decoder_output(self, lstm_layer):
        return TimeDistributed(Dense(1))(lstm_layer)

    def _calculate_combined_loss(self, y_target, y_predicted):
        reconstruction_loss = self._calculate_reconstruction_loss(y_target, y_predicted)
        kl_loss = self._calculate_kl_loss(y_target, y_predicted)
        combined_loss = reconstruction_loss + (self.kulback_coef * kl_loss)
        return combined_loss

    def _calculate_reconstruction_loss(self, y_target, y_predicted):
        error = y_target - y_predicted
        reconstruction_loss = K.mean(K.square(error), axis=1)
        return reconstruction_loss

    def _calculate_kl_loss(self, y_target, y_predicted):
        kl_loss = -0.5 * K.sum(1 + self.log_variance - K.square(self.mu) - K.exp(self.log_variance), axis=1)
        return kl_loss
# Build Variational AutoEncoder(VAE) LSTM Model:
def build_lstm_neural_network(lstm_layer_units=[], leakyrelu_layer_alphas=[], dropout_layer_rates=[],
                              number_of_sequences=32, time_steps=32, data_dim=1, is_stateful_learning=False):
    vae = VAE(
        hidden_layer_units=lstm_layer_units,
        hidden_layer_leakyrelu_alphas=leakyrelu_layer_alphas,
        hidden_layer_dropout_rates=dropout_layer_rates,
        batch_size=number_of_sequences,
        time_steps=time_steps,
        num_features=data_dim,
        is_stateful_learning=is_stateful_learning
    )
    vae.compile(learning_rate)
    vae.summary()
    return vae.model
The model training block looks like this:
# configuration
nn_lstm_layer_units = [160, 3, 160]
nn_leakyrelu_layer_alphas = [0.0, 0.0, 0.0]
nn_dropout_layer_rates = [0.3, 0.0, 0.3]
batch_size = 96
win_length = 64
num_features = 6 # You can use single variate Timeseries data as well, num_features = 1
epochs = 782
learning_rate = 0.0001
want_stateful_learning = True
# Build LSTM VAE model
model = build_lstm_neural_network(nn_lstm_layer_units, nn_leakyrelu_layer_alphas, nn_dropout_layer_rates, batch_size,
win_length, num_features, want_stateful_learning)
TIME_STEPS = win_length
# Generated training sequences for use in the model.
def create_sequences(values, time_steps=TIME_STEPS):
    output = []
    for i in range(len(values) - time_steps + 1):
        output.append(values[i: (i + time_steps)])
    return np.stack(output)
x_train = create_sequences(x_train)
x_val = create_sequences(x_val)
callbacks = []
unfit_train_record_count = 0
unfit_val_record_count = 0
if want_stateful_learning:
    # stateful learning
    # adjust train data size (should be in multiples of batch size)
    unfit_train_record_count = len(x_train) % batch_size
    unfit_val_record_count = len(x_val) % batch_size
    # Reset states of the stateful model on epoch end
    stateful_model_reset_states = LambdaCallback(on_epoch_end=lambda batch, logs: model.reset_states())
    callbacks.append(stateful_model_reset_states)

early_stopping = EarlyStopping(monitor=monitor, patience=patience)
callbacks.append(early_stopping)
# Model training
history = model.fit(x=x_train[unfit_train_record_count:],
                    y=x_train[unfit_train_record_count:, :, [0]],
                    validation_data=(x_val[unfit_val_record_count:], x_val[unfit_val_record_count:, :, [0]]),
                    batch_size=batch_size, epochs=epochs, shuffle=False, callbacks=callbacks)
The stateless mode of the model works as expected, but the stateful mode throws the error below:
1632/1632 [==============================] - ETA: 0s - loss: 0.2447 - _calculate_reconstruction_loss: 0.2447 - _calculate_kl_loss: 0.0326
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [96,3]
[[{{node decoder_input}}]]
[[metrics/_calculate_reconstruction_loss/Identity/_229]]
(1) Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [96,3]
[[{{node decoder_input}}]]
The environment used:
Python-3.8.12,
Tensorflow-gpu: 2.5,
cudnn: 8.2.1.32
I am not clear why the stateful model runs one epoch on the training data, but throws the error as soon as it starts to process the validation data.
I have had the same experience when the dataset and loss function were not suitable. When I tried to reproduce it, several outcomes are possible: the loss value does not change, the loss becomes NaN, or an error is raised during validation.
The cause can be missing values, a mismatch, or neurons that are never updated; TensorFlow 2.x makes this a lot easier.
This is the validation-mismatch case: training works, but validation results in errors (one possibility):
Epoch 1/100
2022-01-23 21:04:59.846791: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
1/1 [==============================] - ETA: 0s - loss: 3.1866 - accuracy: 0.0000e+00Traceback (most recent call last):
Another possibility is a mismatched loss function: it may not be updating the neurons.
Epoch 1/100
2022-01-23 21:08:23.330068: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
1/1 [==============================] - 3s 3s/step - loss: 13.7138 - accuracy: 0.2000 - val_loss: 8.2133 - val_accuracy: 0.0000e+00
Epoch 2/100
1/1 [==============================] - 0s 65ms/step - loss: 7.7745 - accuracy: 0.0000e+00 - val_loss: 8.0456 - val_accuracy: 0.0000e+00
I solved the problem by changing the loss calculation logic: instead of defining the functions that calculate the reconstruction and KL losses inside the VAE class, I moved the loss calculation outside the VAE class, as below.
# Build Variational AutoEncoder(VAE) LSTM Model:
def build_lstm_neural_network(lstm_layer_units=[], leakyrelu_layer_alphas=[], dropout_layer_rates=[],
                              number_of_sequences=32, time_steps=32, data_dim=1, is_stateful_learning=False):
    vae = VAE(
        hidden_layer_units=lstm_layer_units,
        hidden_layer_leakyrelu_alphas=leakyrelu_layer_alphas,
        hidden_layer_dropout_rates=dropout_layer_rates,
        batch_size=number_of_sequences,
        time_steps=time_steps,
        num_features=data_dim,
        is_stateful_learning=is_stateful_learning
    )
    # Add reconstruction loss
    error = vae.model_input - vae.model_output
    reconstruction_loss = K.mean(K.square(error))
    vae.model.add_loss(reconstruction_loss)
    vae.model.add_metric(reconstruction_loss, name='mse_loss', aggregation='mean')
    # Add KL loss
    kl_loss = kl_beta * K.mean(-0.5 * K.sum(1 + vae.log_variance - K.square(vae.mu) - K.exp(vae.log_variance), axis=1), axis=0)
    vae.model.add_loss(kl_loss)
    vae.model.add_metric(kl_loss, name='kl_loss', aggregation='mean')
    optimizer = Adam(learning_rate=vae.learning_rate, clipvalue=vae.clipvalue)
    vae.model.compile(loss=None, optimizer=optimizer)
    vae.summary()
    return vae.model
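With both loss terms attached through add_loss and the model compiled with loss=None, Keras no longer needs explicit targets. A minimal usage sketch under that assumption, reusing the configuration variables from above (not code from the original fix):
# Sketch (assumed call pattern): train the add_loss-based model on inputs only.
model = build_lstm_neural_network(nn_lstm_layer_units, nn_leakyrelu_layer_alphas,
                                  nn_dropout_layer_rates, batch_size,
                                  win_length, num_features, want_stateful_learning)
history = model.fit(x=x_train[unfit_train_record_count:],
                    batch_size=batch_size, epochs=epochs,
                    shuffle=False, callbacks=callbacks)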

Difference between valid accuracy from CategoricalAccuracy() and manual prediction

I used CategoricalAccuracy() to calculate the valid_accuracy during training, and it reached 0.733 at a certain epoch. After training, I used the model weights saved at that epoch to predict the same validation data set, and the accuracy obtained is only 0.38. Why is this?
This is my training log
I only divided the data into two sets: training set and validation set.
There are 517 images in my validation set. I use this model to make manual predictions and compare them with the true labels. Only 199 images are correctly predicted. The accuracy rate is only 0.38, which is less than 0.733.
Even if it is overfitting, I think these two values should be close.
for epoch in range(EPOCHS):
    step = 0
    for features in train_dataset:
        step += 1
        images, labels = process_features(features, data_augmentation=True)
        train_step(images, labels)
        print("Epoch: {}/{}, step: {}/{}, loss: {:.5f}, accuracy: {:.5f}".format(epoch,
                                                                                 EPOCHS,
                                                                                 step,
                                                                                 math.ceil(train_count / BATCH_SIZE),
                                                                                 train_loss.result().numpy(),
                                                                                 train_accuracy.result().numpy()))
    for features in valid_dataset:
        valid_images, valid_labels = process_features(features, data_augmentation=False)
        valid_step(valid_images, valid_labels)
    print("Epoch: {}/{}, train loss: {:.5f}, train accuracy: {:.5f}, "
          "valid loss: {:.5f}, valid accuracy: {:.5f}".format(epoch,
                                                              EPOCHS,
                                                              train_loss.result().numpy(),
                                                              train_accuracy.result().numpy(),
                                                              valid_loss.result().numpy(),
                                                              valid_accuracy.result().numpy()))
def process_features(features, data_augmentation):
    image_raw = features['image_raw'].numpy()
    image_tensor_list = []
    for image in image_raw:
        image_tensor = load_and_preprocess_image(image, data_augmentation=data_augmentation)
        image_tensor_list.append(image_tensor)
    images = tf.stack(image_tensor_list, axis=0)
    labels = features['label'].numpy()
    new_labels = tf.one_hot(labels, 7)
    return images, new_labels

def load_and_preprocess_image(image_raw, data_augmentation=False):
    # decode
    image_tensor = tf.io.decode_image(contents=image_raw, channels=CHANNELS, dtype=tf.dtypes.float32)
    if data_augmentation:
        image = tf.image.random_flip_left_right(image=image_tensor)
        image = tf.image.resize_with_crop_or_pad(image=image,
                                                 target_height=int(IMAGE_HEIGHT * 1.2),
                                                 target_width=int(IMAGE_WIDTH * 1.2))
        image = tf.image.random_crop(value=image, size=[IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS])
        image = tf.image.random_brightness(image=image, max_delta=0.5)
    else:
        image = tf.image.resize(image_tensor, [IMAGE_HEIGHT, IMAGE_WIDTH])
    return image

def valid_step(image_batch, label_batch):
    predictions = model(image_batch, training=False)
    v_loss = loss_object(label_batch, predictions)
    valid_loss.update_state(values=v_loss)
    valid_accuracy.update_state(y_true=label_batch, y_pred=predictions)
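To check the two numbers against each other on identical inputs, a minimal debugging sketch using the valid_dataset, process_features and model objects above (check_accuracy is a name introduced only for this sketch):
# Sketch: compare CategoricalAccuracy with a manual argmax count on identical batches.
check_accuracy = tf.keras.metrics.CategoricalAccuracy()
correct, total = 0, 0
for features in valid_dataset:
    images, labels = process_features(features, data_augmentation=False)
    preds = model(images, training=False)
    check_accuracy.update_state(y_true=labels, y_pred=preds)
    matches = tf.equal(tf.argmax(preds, axis=-1), tf.argmax(labels, axis=-1))
    correct += int(tf.reduce_sum(tf.cast(matches, tf.int32)))
    total += int(images.shape[0])
print("metric:", check_accuracy.result().numpy(), "manual:", correct / total)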
The following is my code to predict each picture of the validation set:
import tensorflow as tf
from configuration import save_model_dir, test_image_dir
from prepare_data import load_and_preprocess_image
from train import get_model
def get_single_picture_prediction(model, picture_dir):
    image_tensor = load_and_preprocess_image(tf.io.read_file(filename=picture_dir), data_augmentation=False)
    image = tf.expand_dims(image_tensor, axis=0)
    prediction = model(image, training=False)
    pred_class = tf.math.argmax(prediction, axis=-1)
    return pred_class

if __name__ == '__main__':
    # GPU settings
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    # load the model
    model = get_model()
    model.load_weights(filepath=save_model_dir+"model")
    pred_class = get_single_picture_prediction(model, test_image_dir)
    print(pred_class)
This is my picture

MXNET custom symbol loss with gluon

I wrote this code (it is almost all from a tutorial; I just modified a few lines), and it is not working.
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, autograd
from mxnet.gluon import nn

np.random.seed(42)
mx.random.seed(42)
ctx = mx.gpu()
def data_xform(data):
    """Move channel axis to the beginning, cast to float32, and normalize to [0, 1]."""
    return nd.moveaxis(data, 2, 0).astype('float32') / 255
# prepare data
train_data = mx.gluon.data.vision.MNIST(train=True).transform_first(data_xform)
val_data = mx.gluon.data.vision.MNIST(train=False).transform_first(data_xform)
batch_size = 100
train_loader = mx.gluon.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = mx.gluon.data.DataLoader(val_data, shuffle=False, batch_size=batch_size)
# create network
data = mx.symbol.Variable('data')
fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
net= gluon.SymbolBlock(outputs=[fc3], inputs=[data])
net.initialize(ctx=ctx)
# create trainer, metric
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': 0.1, 'momentum': 0.9, 'wd': 0.00001},
)
metric = mx.metric.Accuracy()
# learn
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs = inputs.as_in_context(ctx)
        labels = labels.as_in_context(ctx)
        with autograd.record():
            outputs = net(inputs)
            # softmax
            exps = nd.exp(outputs - outputs.min(axis=1).reshape((-1, 1)))
            exps = exps / exps.sum(axis=1).reshape((-1, 1))
            # cross entropy
            loss = nd.MakeLoss(-nd.log(exps.pick(labels)))
            #
            #loss = gluon.loss.SoftmaxCrossEntropyLoss()(outputs, labels)
            #print(loss)
        loss.backward()
        metric.update(labels, outputs)
        trainer.step(batch_size=inputs.shape[0])
    name, acc = metric.get()
    print('After epoch {}: {} = {}'.format(epoch + 1, name, acc))
    metric.reset()
If I use gluon.loss.SoftmaxCrossEntropyLoss, this runs well.
When I print the loss in both cases, the output values look the same.
What is the difference?
Thanks in advance.
I am not entirely sure why you subtract outputs.min() when calculating the softmax. The original softmax function doesn't do anything like that - https://en.wikipedia.org/wiki/Softmax_function. If you don't do that, you will get a good accuracy value:
# softmax
exps = nd.exp(outputs)
exps = exps / exps.sum(axis=1).reshape((-1, 1))
# cross entropy
loss = nd.MakeLoss(-nd.log(exps.pick(labels)))
I get:
After epoch 1: accuracy = 0.89545
After epoch 2: accuracy = 0.9639
After epoch 3: accuracy = 0.97395
After epoch 4: accuracy = 0.9784
After epoch 5: accuracy = 0.98315

TensorFlow ValueError: Rank mismatch error

All, I am completely stuck due to an error in my code to classify cats vs. dogs using a convolutional network. I could use the high-level libraries available these days, but for learning I want to get this lower-level version working. The output is a binary classification of an image containing either a cat or a dog. I have scanned a number of rank-related threads, but I am unable to work out how to solve this error when using sparse_softmax_cross_entropy_with_logits specifically.
If I change two lines (use softmax_cross_entropy_with_logits_v2() and uncomment labels = tf.argmax(y, 1)), then it runs, but the accuracy, even on the training set, degrades rapidly (the net diverges).
Any help would be much appreciated. Thanks.
The two lines I am not 100% sure about are as follows.
Should the 1 here be n_outputs (which is 2)? (It is binary, but that does not seem right.)
y = tf.placeholder(dtype=tf.int64, shape=[100, 1], name="y")
This is the line that throws the error: ValueError: Rank mismatch: Rank of labels (received 2) should equal rank of logits minus 1 (received 2).
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(\
labels=y, logits=logits)
Full code (from the point of having data in hand) is as below; it is long-ish but simple. I have commented out the end since the error is thrown before it gets there. Error is at the end of it below.
import numpy as np
import tensorflow as tf

#---Split data into training & test sets---
# Work the data for cats and dogs numpy arrays
# These numpy arrays were generated in previous data prep work
# Stack the numpy arrays for the inputs
X_cat_dog = np.concatenate((cats_1000_64_64_1, dogs_1000_64_64_1),
axis = 0)
X_cat_dog = X_cat_dog.reshape(-1, width*height) #Flatten
# Scikit Learn for min-max scaling of the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(np.array([0., 255.]).reshape(-1,1))
X_cat_dog_scaled = scaler.transform(X_cat_dog)
# Define the labels to be used: cats = 0, dogs = 1
y_cat_dog = np.concatenate((np.zeros((1000), dtype = np.int32),
np.ones((1000), dtype = np.int32)),
axis = 0)
# Scikit Learn for random splitting of the data
from sklearn.model_selection import train_test_split
# Random split of data into training (80%) and test (20%)
X_train, X_test, y_train, y_test = \
train_test_split(X_cat_dog_scaled, y_cat_dog, test_size=0.20,
random_state = RANDOM_SEED)
print('Train orig. shape:', X_train.shape, y_train.shape)
print('Test orig. shape:', X_test.shape, y_test.shape)
#Reshape into 4D
X_train = np.reshape(X_train, newshape=[X_train.shape[0], height, width, channels])
y_train = np.reshape(y_train, newshape=[y_train.shape[0], 1])
X_test = np.reshape(X_test, newshape=[X_test.shape[0], height, width, channels])
y_test = np.reshape(y_test, newshape=[y_test.shape[0], 1])
print('Train 4D shape:', X_train.shape, y_train.shape, type(X_train), type(y_train))
print('Test 4D shape:', X_test.shape, y_test.shape, type(X_test), type(y_test))
#---Define and run convolution net---
#Init
results = [] #Summary results
reset_graph() #Else upon rerun, error occurs
n_outputs = 2 #Binary; cat or dog
n_strides = [1,2,2] #Symmetric XY + same across conv & pool
n_conv_blocks = 1 #Number of convolution blocks
n_filters = [5, 10, 20] #Number of filters applied per layer
#Placeholders for batch training
X = tf.placeholder(dtype=tf.float64,
shape=[100, height, width, channels], name="X")
y = tf.placeholder(dtype=tf.int64, shape=[100, 1], name="y")
print('X.shape =', X.shape, tf.rank(X))
print('y.shape =', y.shape, tf.rank(y))
#Define hidden layers
with tf.name_scope("cnn"):
#Create number of convolution blocks required
for block in range(n_conv_blocks):
#Convolution layer
inputLayer = X
if (block>0):
inputLayer = pool
print('\nStride:', n_strides[block])
conv = tf.layers.conv2d(inputLayer,
filters = n_filters[block],
kernel_size = 1,
strides = n_strides[block],
activation = tf.nn.leaky_relu,
padding = "SAME")
print('Conv '+str(block)+'.shape =',
conv.get_shape().as_list())
#Pooling layer
pool = tf.nn.avg_pool(conv,
ksize = [1,2,2,1],
strides = [1,n_strides[block],n_strides[block],1],
padding = "SAME")
print('Pool '+str(block)+'.shape =', pool.shape)
pool_shape = pool.get_shape().as_list()
next_width = pool_shape[1]
next_height = pool_shape[2]
next_depth = pool_shape[3]
#Fully connected
flattened = tf.reshape(pool, [-1,
next_width * next_height * next_depth])
print('\nFlattened.shape =', flattened.shape)
hidden = tf.layers.dense(flattened,
next_width * next_height * next_depth,
name="hidden1",
activation=tf.nn.leaky_relu)
print('\nHidden.shape =', hidden.shape, tf.rank(hidden))
#Output
logits = tf.layers.dense(hidden, n_outputs, name="outputs")
print('\nLogits.shape =', logits.shape, tf.rank(logits))
#Define loss function
with tf.name_scope("loss"):
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(\
labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")
#Define optimizer used for reducing the loss; MomentumOptimizer
learning_rate = 0.01
momentum = 0.01
with tf.name_scope("train"):
optimizer = tf.train.MomentumOptimizer(learning_rate,
momentum)
training_op = optimizer.minimize(loss)
#Define performance measure; accuracy in this case
with tf.name_scope("eval"):
#labels = tf.argmax(y, 1)
labels = y
correct = tf.nn.in_top_k(logits, labels, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
#Define instantiator for TensorFlow variables
init = tf.global_variables_initializer()
#Carry out training in mini-batches
n_epochs = 1
batch_size = 100
with tf.Session() as sess:
    #Instantiate variables
    init.run()
    #Loop over n_epochs
    for epoch in range(n_epochs):
        #Loop over batches
        for iteration in range(y_train.shape[0] // batch_size):
            X_batch = X_train[
                iteration*batch_size:(iteration + 1)*batch_size, :]
            y_batch = y_train[
                iteration*batch_size:(iteration + 1)*batch_size]
            print(y_batch.shape, type(y_batch))
            # sess.run(training_op, feed_dict={X: X_batch,
            #                                  y: y_batch})
            # #Measure performance
            # acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            # acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
            # if (epoch % 1 == 0):
            #     print(epoch,
            #           "Train Accuracy:",
            #           '{:0.1%}'.format(acc_train),
            #           "\tTest Accuracy:",
            #           '{:0.1%}'.format(acc_test))
            #     results.append([epoch, acc_train, acc_test])
Error is as follows.
X.shape = (100, 64, 64, 1) Tensor("Rank:0", shape=(), dtype=int32)
y.shape = (100, 1) Tensor("Rank_1:0", shape=(), dtype=int32)
Stride: 1
Conv 0.shape = [100, 64, 64, 5]
Pool 0.shape = (100, 64, 64, 5)
Flattened.shape = (100, 20480)
Hidden.shape = (100, 20480) Tensor("cnn/Rank:0", shape=(), dtype=int32)
Logits.shape = (100, 2) Tensor("cnn/Rank_1:0", shape=(), dtype=int32)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-7961eb9a772c> in <module>()
58 #Define loss function
59 with tf.name_scope("loss"):
---> 60 xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=y, logits=logits)
61 loss = tf.reduce_mean(xentropy, name="loss")
62
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_ops.py in sparse_softmax_cross_entropy_with_logits(_sentinel, labels, logits, name)
2645 raise ValueError("Rank mismatch: Rank of labels (received %s) should "
2646 "equal rank of logits minus 1 (received %s)." %
-> 2647 (labels_static_shape.ndims, logits.get_shape().ndims))
2648 if (static_shapes_fully_defined and
2649 labels_static_shape != logits.get_shape()[:-1]):
ValueError: Rank mismatch: Rank of labels (received 2) should equal rank of logits minus 1 (received 2).
OK, I figured it out. I adjusted two lines as follows.
I dropped the extra dimension from the shape of y:
y = tf.placeholder(dtype=tf.int64, shape=[None], name="y")
All references to y_batch after its definition were replaced with y_batch.reshape(-1). This was needed to reduce the rank of y_batch and get rid of the error.
The rest remained unchanged. Now I have a new problem: the accuracy remains low, but at least it is now behaving itself (and not going to zero). Playtime!
0 Train Accuracy: 61.0% Test Accuracy: 48.5%
1 Train Accuracy: 60.0% Test Accuracy: 48.8%
2 Train Accuracy: 61.0% Test Accuracy: 49.5%
3 Train Accuracy: 65.0% Test Accuracy: 50.2%
4 Train Accuracy: 65.0% Test Accuracy: 51.0%
5 Train Accuracy: 64.0% Test Accuracy: 51.0%
6 Train Accuracy: 65.0% Test Accuracy: 51.5%
7 Train Accuracy: 66.0% Test Accuracy: 51.0%
8 Train Accuracy: 64.0% Test Accuracy: 51.2%
9 Train Accuracy: 63.0% Test Accuracy: 52.5%
10 Train Accuracy: 62.0% Test Accuracy: 52.0%
11 Train Accuracy: 62.0% Test Accuracy: 52.0%
12 Train Accuracy: 63.0% Test Accuracy: 53.5%
13 Train Accuracy: 63.0% Test Accuracy: 53.5%
14 Train Accuracy: 63.0% Test Accuracy: 54.0%
15 Train Accuracy: 63.0% Test Accuracy: 53.5%
16 Train Accuracy: 64.0% Test Accuracy: 53.5%
17 Train Accuracy: 64.0% Test Accuracy: 53.8%
18 Train Accuracy: 65.0% Test Accuracy: 53.8%
19 Train Accuracy: 65.0% Test Accuracy: 53.8%
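For reference, a minimal sketch of the two adjusted pieces described above, in context; the rest of the graph is unchanged:
# Labels placeholder without the extra trailing dimension: rank 1, shape [None].
y = tf.placeholder(dtype=tf.int64, shape=[None], name="y")

# When feeding a batch, flatten the (batch_size, 1) label array to rank 1.
sess.run(training_op, feed_dict={X: X_batch, y: y_batch.reshape(-1)})
acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch.reshape(-1)})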