TensorFlow CNN: Why is the validation loss significantly different from the start, and why does it keep increasing? - tensorflow

This is a classification model for ten categories of pictures. My code has three files: convNet.py defines the CNN model, read_TFRecord.py reads the data, and train.py trains and evaluates the model. The training set has 80,000 samples and the validation set has 20,000 samples.
Question:
In the first epoch:
training loss = 2.11, train accuracy = 25.61%
validation loss = 3.05, validation accuracy = 8.29%
Why is the validation loss so different right from the start? And why is the validation accuracy always below 10%?
After 10 epochs of training:
The training curves look like normal learning, but the validation loss slowly increases and the validation accuracy keeps oscillating around 10%. Is it over-fitting? I have already taken some measures, such as adding regularization losses and dropout. I do not know where the problem is. I hope you can help me.
convNet.py:
def convNet(features, mode):
    input_layer = tf.reshape(features, [-1, 100, 100, 3])
    tf.summary.image('input', input_layer)
    # conv1
    with tf.name_scope('conv1'):
        conv1 = tf.layers.conv2d(
            inputs=input_layer,
            filters=32,
            kernel_size=5,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv1'
        )
        conv1_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv1')
        tf.summary.histogram('kernel', conv1_vars[0])
        tf.summary.histogram('bias', conv1_vars[1])
        tf.summary.histogram('act', conv1)
    # pool1 100->50
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2, name='pool1')
    # dropout
    pool1_dropout = tf.layers.dropout(
        inputs=pool1, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='pool1_dropout')
    # conv2
    with tf.name_scope('conv2'):
        conv2 = tf.layers.conv2d(
            inputs=pool1_dropout,
            filters=64,
            kernel_size=5,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv2'
        )
        conv2_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv2')
        tf.summary.histogram('kernel', conv2_vars[0])
        tf.summary.histogram('bias', conv2_vars[1])
        tf.summary.histogram('act', conv2)
    # pool2 50->25
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2, name='pool2')
    # dropout
    pool2_dropout = tf.layers.dropout(
        inputs=pool2, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='pool2_dropout')
    # conv3
    with tf.name_scope('conv3'):
        conv3 = tf.layers.conv2d(
            inputs=pool2_dropout,
            filters=128,
            kernel_size=3,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv3'
        )
        conv3_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv3')
        tf.summary.histogram('kernel', conv3_vars[0])
        tf.summary.histogram('bias', conv3_vars[1])
        tf.summary.histogram('act', conv3)
    # pool3 25->12
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2, name='pool3')
    # dropout
    pool3_dropout = tf.layers.dropout(
        inputs=pool3, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='pool3_dropout')
    # conv4
    with tf.name_scope('conv4'):
        conv4 = tf.layers.conv2d(
            inputs=pool3_dropout,
            filters=128,
            kernel_size=3,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv4'
        )
        conv4_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv4')
        tf.summary.histogram('kernel', conv4_vars[0])
        tf.summary.histogram('bias', conv4_vars[1])
        tf.summary.histogram('act', conv4)
    # pool4 12->6
    pool4 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=2, name='pool4')
    # dropout
    pool4_dropout = tf.layers.dropout(
        inputs=pool4, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='pool4_dropout')
    pool4_flat = tf.reshape(pool4_dropout, [-1, 6 * 6 * 128])
    # fc1
    with tf.name_scope('fc1'):
        fc1 = tf.layers.dense(inputs=pool4_flat, units=1024, activation=tf.nn.relu,
                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                              kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                              name='fc1')
        fc1_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'fc1')
        tf.summary.histogram('kernel', fc1_vars[0])
        tf.summary.histogram('bias', fc1_vars[1])
        tf.summary.histogram('act', fc1)
    # dropout
    fc1_dropout = tf.layers.dropout(
        inputs=fc1, rate=0.3, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='fc1_dropout')
    # fc2
    with tf.name_scope('fc2'):
        fc2 = tf.layers.dense(inputs=fc1_dropout, units=512, activation=tf.nn.relu,
                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                              kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                              name='fc2')
        fc2_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'fc2')
        tf.summary.histogram('kernel', fc2_vars[0])
        tf.summary.histogram('bias', fc2_vars[1])
        tf.summary.histogram('act', fc2)
    # dropout
    fc2_dropout = tf.layers.dropout(
        inputs=fc2, rate=0.3, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='fc2_dropout')
    # logits
    with tf.name_scope('out'):
        logits = tf.layers.dense(inputs=fc2_dropout, units=10, activation=None,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                 kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                                 name='out')
        out_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'out')
        tf.summary.histogram('kernel', out_vars[0])
        tf.summary.histogram('bias', out_vars[1])
        tf.summary.histogram('act', logits)
    return logits
read_TFRecord.py:
def read_and_decode(filename, width, height, channel):
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.FixedLenFeature([], tf.int64),
                                           'img_raw': tf.FixedLenFeature([], tf.string),
                                       })
    img = tf.decode_raw(features['img_raw'], tf.uint8)
    img = tf.reshape(img, [width, height, channel])
    img = tf.cast(img, tf.float16) * (1. / 255) - 0.5
    label = tf.cast(features['label'], tf.int16)
    return img, label
train.py:
# step 1
TRAIN_TFRECORD = 'F:/10-image-set2/train.tfrecords'  # train data set
VAL_TFRECORD = 'F:/10-image-set2/val.tfrecords'  # validation data set
WIDTH = 100  # image width
HEIGHT = 100  # image height
CHANNEL = 3  # image channel
TRAIN_BATCH_SIZE = 64
VAL_BATCH_SIZE = 16
train_img, train_label = read_and_decode(TRAIN_TFRECORD, WIDTH, HEIGHT,
                                         CHANNEL)
val_img, val_label = read_and_decode(VAL_TFRECORD, WIDTH, HEIGHT, CHANNEL)
x_train_batch, y_train_batch = tf.train.shuffle_batch([train_img, train_label],
                                                      batch_size=TRAIN_BATCH_SIZE,
                                                      capacity=80000, min_after_dequeue=79999,
                                                      num_threads=64, name='train_shuffle_batch')
x_val_batch, y_val_batch = tf.train.shuffle_batch([val_img, val_label],
                                                  batch_size=VAL_BATCH_SIZE,
                                                  capacity=20000, min_after_dequeue=19999,
                                                  num_threads=64, name='val_shuffle_batch')
# step 2
x = tf.placeholder(tf.float32, shape=[None, WIDTH, HEIGHT, CHANNEL],
                   name='x')
y_ = tf.placeholder(tf.int32, shape=[None, ], name='y_')
mode = tf.placeholder(tf.string, name='mode')
step = tf.get_variable(shape=(), dtype=tf.int32, initializer=tf.zeros_initializer(), name='step')
tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, step)
logits = convNet(x, mode)
with tf.name_scope('Reg_losses'):
    reg_losses = tf.cond(tf.equal(mode, learn.ModeKeys.TRAIN),
                         lambda: tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
                         lambda: tf.constant(0, dtype=tf.float32))
with tf.name_scope('Loss'):
    loss = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits) + reg_losses
train_op = tf.train.AdamOptimizer().minimize(loss, step)
correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), y_)
with tf.name_scope('Accuracy'):
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# step 3
tf.summary.scalar("reg_losses", reg_losses)
tf.summary.scalar("loss", loss)
tf.summary.scalar("accuracy", acc)
merged = tf.summary.merge_all()
# step 4
with tf.Session() as sess:
    summary_dir = './logs/summary/'
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver = tf.train.Saver(max_to_keep=1)
    train_writer = tf.summary.FileWriter(summary_dir + 'train',
                                         sess.graph)
    valid_writer = tf.summary.FileWriter(summary_dir + 'valid')
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    max_acc = 0
    MAX_EPOCH = 10
    for epoch in range(MAX_EPOCH):
        # training
        train_step = int(80000 / TRAIN_BATCH_SIZE)
        train_loss, train_acc = 0, 0
        for step in range(epoch * train_step, (epoch + 1) * train_step):
            x_train, y_train = sess.run([x_train_batch, y_train_batch])
            train_summary, _, err, ac = sess.run([merged, train_op, loss, acc],
                                                 feed_dict={x: x_train, y_: y_train,
                                                            mode: learn.ModeKeys.TRAIN,
                                                            global_step: step})
            train_loss += err
            train_acc += ac
            if (step + 1) % 50 == 0:
                train_writer.add_summary(train_summary, step)
        print("Epoch %d,train loss= %.2f,train accuracy=%.2f%%" % (
            epoch, (train_loss / train_step), (train_acc / train_step * 100.0)))
        # validation
        val_step = int(20000 / VAL_BATCH_SIZE)
        val_loss, val_acc = 0, 0
        for step in range(epoch * val_step, (epoch + 1) * val_step):
            x_val, y_val = sess.run([x_val_batch, y_val_batch])
            val_summary, err, ac = sess.run([merged, loss, acc],
                                            feed_dict={x: x_val, y_: y_val, mode: learn.ModeKeys.EVAL,
                                                       global_step: step})
            val_loss += err
            val_acc += ac
            if (step + 1) % 50 == 0:
                valid_writer.add_summary(val_summary, step)
        print(
            "Epoch %d,validation loss= %.2f,validation accuracy=%.2f%%" % (
                epoch, (val_loss / val_step), (val_acc / val_step * 100.0)))
        # save model
        if val_acc > max_acc:
            max_acc = val_acc
            saver.save(sess, summary_dir + '/10-image.ckpt', epoch)
            print("model saved")
    coord.request_stop()
    coord.join(threads)
Tensorboard result:
(Orange is training, blue is validation.)
[TensorBoard panels: accuracy, loss, reg_losses, conv1, conv2, conv3, conv4, fc1, fc2, output]
My data:
[Sample training and validation data]

I doubt this is an issue of overfitting - the losses are significantly different right from the start and diverge further well before you get through your first epoch (~500 batches). Without seeing your dataset it's difficult to say more, though as a first step I'd encourage you to visualize both the training and evaluation input data to make sure the issue isn't something there. The fact that you initially get significantly less than 10% on a 10-class classification problem indicates you almost certainly do have something wrong here.
Having said that, you will likely run into problems with overfitting using this model because, despite what you may think, you aren't using dropout or regularization.
Dropout: mode == learn.ModeKeys.TRAIN is false if mode is a tensor, so you're never actually using dropout. You could use tf.equal(mode, learn.ModeKeys.TRAIN), but I think you'd be much better off passing a training bool tensor to your convNet and feeding in the appropriate value; a minimal sketch of that approach follows.
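A minimal sketch of the training-bool approach, assuming a placeholder named is_training (the placeholder name and the truncated layer stack are illustrative, not the original code):
import tensorflow as tf

def convNet(features, is_training):
    # Same network as in the question, but dropout is keyed off a bool tensor.
    input_layer = tf.reshape(features, [-1, 100, 100, 3])
    conv1 = tf.layers.conv2d(input_layer, filters=32, kernel_size=5,
                             padding='same', activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=2)
    # tf.layers.dropout only drops units when `training` evaluates to True.
    pool1_dropout = tf.layers.dropout(pool1, rate=0.5, training=is_training)
    # ... remaining conv/fc layers as in the question ...
    return pool1_dropout

x = tf.placeholder(tf.float32, shape=[None, 100, 100, 3], name='x')
is_training = tf.placeholder(tf.bool, shape=(), name='is_training')
net_out = convNet(x, is_training)
# Training step:   sess.run(train_op, feed_dict={..., is_training: True})
# Evaluation step: sess.run([loss, acc], feed_dict={..., is_training: False})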
Regularization: you're creating the regularization loss terms and they're being added to the tf.GraphKeys.REGULARIZATION_LOSSES collection, but the loss you're minimizing doesn't use them. Add the following:
loss += tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
before you optimize.
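As a self-contained sketch of where that line fits (the tiny stand-in logits and labels here are only for illustration; in the question's train.py they would be the convNet output and y_):
import tensorflow as tf

# Stand-in logits/labels just to make the snippet runnable on its own.
logits = tf.layers.dense(tf.zeros([4, 16]), units=10,
                         kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01))
labels = tf.zeros([4], dtype=tf.int32)

loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
# Every kernel_regularizer passed to tf.layers lands in this collection;
# summing it into the loss is what actually applies the L2 penalty.
loss += tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
train_op = tf.train.AdamOptimizer().minimize(loss)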
A note on the optimization step: you shouldn't be feeding in a value to the session run like you are. Every time you run the optimization operation it will increment the step variable you passed in when you created it, so just create it as an int variable and leave it alone. See the following example code:
import tensorflow as tf

x = tf.get_variable(shape=(4, 3), dtype=tf.float32,
                    initializer=tf.random_normal_initializer(), name='x')
loss = tf.nn.l2_loss(x)
step = tf.get_variable(shape=(), dtype=tf.int32,
                       initializer=tf.zeros_initializer(), name='step')
tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, step)  # good practice

opt = tf.train.AdamOptimizer().minimize(loss, step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    step_val = sess.run(step)
    print(step_val)  # 0
    sess.run(opt)
    step_val = sess.run(step)
    print(step_val)  # 1

Related

How to minimize the loss?

This should be a regression problem.
I would like the neural network to be able to estimate the length of a line, in pixels, from an image. Here are 3 example images, each 200 x 200 pixels:
[Images a), b), c)]
There are 6000 training images and 1000 validation images.
The labels are the distances in pixels:
a) 1.205404496424333018e+02
b) 1.188780888137086436e+02
c) 1.110180165558725918e+02
Here is my training code:
img_size = 200

def preprocess_image(image):
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [img_size, img_size])
    image /= 255.0  # normalize to [0,1] range
    return image

def load_and_preprocess_image(path):
    image = tf.read_file(path)
    return preprocess_image(image)

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16

train_labels = np.loadtxt("train_labels.txt")
val_labels = np.loadtxt("test_labels.txt")
train_images = sorted(glob.glob("train_img/img_*.jpg"))
val_images = sorted(glob.glob("test_img/img_*.jpg"))

steps_per_epoch_count = tf.ceil(len(train_images) / BATCH_SIZE)

train_path_ds = tf.data.Dataset.from_tensor_slices(train_images)
val_path_ds = tf.data.Dataset.from_tensor_slices(val_images)

train_image_ds = train_path_ds.map(load_and_preprocess_image,
                                   num_parallel_calls=AUTOTUNE)
train_label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(train_labels, tf.float32))
train_image_label_ds = tf.data.Dataset.zip((train_image_ds, train_label_ds))

val_image_ds = val_path_ds.map(load_and_preprocess_image,
                               num_parallel_calls=AUTOTUNE)
val_label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(val_labels, tf.float32))
val_image_label_ds = tf.data.Dataset.zip((val_image_ds, val_label_ds))

model = tf.keras.models.Sequential([
    tf.keras.layers.Convolution2D(16, 3, 3, input_shape=(img_size, img_size, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Convolution2D(32, 3, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    # tf.keras.layers.Convolution2D(64, 3, 3, activation='relu'),
    # tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(400, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(200, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(100, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.05),
    tf.keras.layers.Dense(1, activation=tf.nn.relu)
])

model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01),
              loss="mean_squared_error",
              metrics=["mean_absolute_error", "mean_squared_error"])

train_ds = train_image_label_ds.apply(
    tf.data.experimental.shuffle_and_repeat(buffer_size=len(train_images)))
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

val_ds = val_image_label_ds.apply(
    tf.data.experimental.shuffle_and_repeat(buffer_size=len(val_images)))
val_ds = val_ds.batch(BATCH_SIZE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

history = model.fit(
    train_ds,
    epochs=80,
    validation_data=val_ds,
    steps_per_epoch=374,
    validation_steps=62
)
However, this is the train vs eval mean_squared_error plot:
Question:
Why is the validation loss not stable?
The average Mean Squared Error is about 400 in training, which seems too high. What modification can I make to improve the estimation?
EDIT:
This is my latest model:
Learning rate = 0.01
Batch size = 16
model = tf.keras.models.Sequential([
    tf.keras.layers.Convolution2D(16, 3, 3, input_shape=(img_size, img_size, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Convolution2D(32, 3, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  # kernel_regularizer=tf.keras.regularizers.l2(0.001)
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  # kernel_regularizer=tf.keras.regularizers.l2(0.001)
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  # kernel_regularizer=tf.keras.regularizers.l2(0.001)
    tf.keras.layers.Dense(1, activation="linear")
])
The output looks like this:
As you can see, the training and validation losses are almost identical. The MSE loss stabilizes around 2393 for both, whose square root is a pixel error of 48.91, which is quite high.
What would you advise to lower it further? Is this normal?

Why is the reduce_mean applied to the output of sparse_softmax_cross_entropy_with_logits?

There are several tutorials that apply reduce_mean to the output of sparse_softmax_cross_entropy_with_logits. For example:
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
or
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.cast(y_, dtype=tf.int32), logits=y_conv))
Why is reduce_mean applied to the output of sparse_softmax_cross_entropy_with_logits? Is it because we are using mini-batches, and so we want to calculate (using reduce_mean) the average loss over all samples of the mini-batch?
The reason is to get the average loss over the batch.
Generally you will train a neural network with input batches of size > 1; each element in the batch produces a loss value, so the easiest way to merge these into one value is to average them.
I found something interesting.
First, let's define sparse_vector as
sparse_vector = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.cast(y_, dtype=tf.int32), logits=y_conv)
sparse_vector is a vector with one loss value per example, so we still need to reduce it to a single scalar; that is why we use reduce_mean.
import numpy as np
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
print(mnist.test.labels.shape)
print(mnist.train.labels.shape)

with tf.name_scope('inputs'):
    X_ = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.int64, [None])

X = tf.reshape(X_, [-1, 28, 28, 1])
h_conv1 = tf.layers.conv2d(X, filters=32, kernel_size=5, strides=1,
                           padding='same', activation=tf.nn.relu, name='conv1')
h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=2, strides=2,
                                  padding='same', name='pool1')
h_conv2 = tf.layers.conv2d(h_pool1, filters=64, kernel_size=5, strides=1,
                           padding='same', activation=tf.nn.relu, name='conv2')
h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=2, strides=2,
                                  padding='same', name='pool2')
# flatten
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
h_fc1 = tf.layers.dense(h_pool2_flat, 1024, name='fc1', activation=tf.nn.relu)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)
h_fc2 = tf.layers.dense(h_fc1_drop, units=10, name='fc2')
# y_conv = tf.nn.softmax(h_fc2)
y_conv = h_fc2
# print('Finished building network.')

# cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv))
sparse_vector = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.cast(y_, dtype=tf.int32), logits=y_conv)
cross_entropy = tf.reduce_mean(sparse_vector)

sess.run(tf.global_variables_initializer())
# print(sparse_vector)
# print(cross_entropy)
# Tensor("SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits:0", shape=(?,), dtype=float32)
# Tensor("Mean:0", shape=(), dtype=float32)
batch = mnist.train.next_batch(10)
sparse_vector, cross_entropy = sess.run(
    [sparse_vector, cross_entropy],
    feed_dict={X_: batch[0], y_: batch[1]})
print(sparse_vector)
print(cross_entropy)
the output is
[2.2213464 2.2676413 2.3555744 2.3196406 2.0794516 2.394274 2.266591
2.3139718 2.345526 2.3952296]
2.2959247

GradientDescentOptimizer gives much lower accuracy (~0.10) than AdamOptimizer (~0.95) in a convolutional neural net in TensorFlow

I am building a convolutional neural network for classifying MNIST data. I am using 2 conv layers and 2 fully connected layers.
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


def _net_params():
    weights = {
        'conv1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
        'conv2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
        'fc1': tf.Variable(tf.random_normal([7 * 7 * 64, 1024])),
        'fc2': tf.Variable(tf.random_normal([1024, 10])),
    }
    biases = {
        'conv1': tf.Variable(tf.random_normal([32]), tf.float32),
        'conv2': tf.Variable(tf.random_normal([64]), tf.float32),
        'fc1': tf.Variable(tf.random_normal([1024]), tf.float32),
        'fc2': tf.Variable(tf.random_normal([10]), tf.float32),
    }
    return weights, biases


def _fc_layer(inputs, weights, biases):
    return tf.add(tf.matmul(inputs, weights), biases)


def _conv_layer(inputs, weights, biases, stride=1, padding='SAME'):
    layer = tf.nn.conv2d(input=inputs, filter=weights,
                         strides=[1, stride, stride, 1], padding=padding)
    layer = tf.nn.bias_add(layer, biases)
    return tf.nn.relu(layer)


def pool_layer(inputs):
    pool = tf.nn.max_pool(inputs, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                          padding="SAME")
    return pool


def conv_net(x):
    weights, biases = _net_params()
    x = tf.reshape(x, shape=[-1, 28, 28, 1])
    # Conv layers
    conv1 = _conv_layer(x, weights['conv1'], biases['conv1'])
    pool1 = pool_layer(conv1)
    conv2 = _conv_layer(pool1, weights['conv2'], biases['conv2'])
    pool2 = pool_layer(conv2)
    flattened = tf.reshape(pool2, [-1, 7 * 7 * 64])
    fc1 = _fc_layer(flattened, weights['fc1'], biases['fc1'])
    fc1 = tf.nn.relu(fc1)
    fc2 = _fc_layer(fc1, weights['fc2'], biases['fc2'])
    return fc2


def _training():
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])
    learning_rate_ = tf.placeholder(tf.float32)
    pred = conv_net(x)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y_))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate_).minimize(cost)
    # optimizer = tf.train.GradientDescentOptimizer(
    #     learning_rate=learning_rate_).minimize(cost)
    correct = tf.equal(tf.argmax(pred, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    return x, y_, learning_rate_, optimizer, cost, accuracy


def main():
    mnist = input_data.read_data_sets('tmp/data', one_hot=True)
    n_epochs = 3
    batch_size = 200
    learning_rate = 0.005
    x, y_, learning_rate_, optimizer, cost, accuracy = _training()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        current_epoch = 0
        while current_epoch < n_epochs:
            current_epoch += 1
            print('epoch %s' % (current_epoch,))
            current_batch = 1
            while current_batch * batch_size <= len(mnist.train.images):
                current_batch += 1
                batch_x, batch_y = mnist.train.next_batch(batch_size)
                sess.run(fetches=optimizer,
                         feed_dict={x: batch_x, y_: batch_y, learning_rate_: learning_rate, })
                if current_batch % 75 == 0:
                    loss, acc = sess.run([cost, accuracy],
                                         feed_dict={x: batch_x, y_: batch_y, learning_rate_: 0., })
                    print('  batch %s: batch_loss=%s, training_accuracy=%s' % (
                        current_batch, loss, acc,))
        print('Training complete !')
        print('Final accuracy is %s' % sess.run(
            accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels, learning_rate_: 0.}))


if __name__ == '__main__':
    main()
(There might be some indentation errors from pasting this code here on Stack Overflow.)
When I use AdamOptimizer, I get accuracy > 95%.
[Accuracy for AdamOptimizer]
But when I use GradientDescentOptimizer, I get an accuracy of 10%.
[Accuracy for GradientDescentOptimizer]
Do you know why I am getting this lower accuracy, and how to fix it if I want to use GradientDescentOptimizer?
Thanks

TensorFlow logits and labels error, but they are the same shape

This question has been asked several times already, but I don't seem to be able to adapt previous solutions to my code, so I would appreciate any advice on how to solve this. I have tried using pdb and setting a trace point right before the problem, which didn't give me much information.
I am adapting this tutorial to my problem:
https://www.oreilly.com/ideas/visualizing-convolutional-neural-networks
Data Shape:
x_train.shape: (1161, 68, 68, 1)
x_test.shape: (216, 68, 68, 1)
y_test.shape: (216,)
y_train.shape: (1161,)
Where the error occurs:
# Train the Model
steps = int(x_train.shape[0] / batchSize)
for i in range(numEpochs):
    print(i)
    accHist = []
    accHist2 = []
    # x_train, y_train = imf.shuffle(x_train, y_train)
    for j in range(steps):
        print(j)
        # Calculate our current step
        step = i * steps + j
        # Feed forward batch of train images into graph and log accuracy
        acc = sess.run([accuracy], feed_dict={X: x_train[(j * batchSize):((j + 1) * batchSize), :, :, :],
                                              Y_: np.array(y_train[(j * batchSize):((j + 1) * batchSize)]).reshape(1, 30),
                                              keepRate1: 1, keepRate2: 1})
        print(accHist)
        accHist.append(acc)
        # Back propagate using adam optimizer to update weights and biases.
        sess.run(train_step, feed_dict={X: x_train[(j * batchSize):((j + 1) * batchSize), :, :, :],
                                        Y_: np.array(y_train[(j * batchSize):((j + 1) * batchSize)]).reshape(1, 30),
                                        keepRate1: 0.2, keepRate2: 0.5})
        print("success")
    print('Epoch number {} Training Accuracy: {}'.format(i + 1, np.mean(accHist)))
    # Feed forward all test images into graph and log accuracy
    for k in range(int(x_test.shape[0] / batchSize)):
        acc = sess.run(accuracy, feed_dict={X: x_test[(k * batchSize):((k + 1) * batchSize), :, :, :],
                                            Y_: np.array(y_test[(k * batchSize):((k + 1) * batchSize)]).reshape(1, 30),
                                            keepRate1: 1, keepRate2: 1})
        accHist2.append(acc)
    print("Test Set Accuracy: {}".format(np.mean(accHist2)))
I am getting the following error message:
InvalidArgumentError: logits and labels must be same size: logits_size=[30,30] labels_size=[1,30]
[[Node: cross_entropy_7/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](cross_entropy_7/Reshape, cross_entropy_7/Reshape_1)]]
Following the tutorial, I thought the logits were set here:
# FULLY CONNECTED 3 & SOFTMAX OUTPUT
with tf.name_scope('softmax') as scope:
    fc2w = tf.Variable(tf.truncated_normal([512, classes], dtype=tf.float32,
                                           stddev=1e-1), name='weights3_2')
    fc2b = tf.Variable(tf.constant(1.0, shape=[classes], dtype=tf.float32),
                       trainable=True, name='biases3_2')
    Ylogits = tf.nn.bias_add(tf.matmul(fc1_drop, fc2w), fc2b)
    Y = tf.nn.softmax(Ylogits)
print(Ylogits.shape) here gives me: (?, 30). Classes is set at 30 so this seems to make sense.
This seems to be the function that doesn't work, so I printed the shapes:
with tf.name_scope('cross_entropy'):
    print(Ylogits.shape)
    print(Y.shape)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
    loss = tf.reduce_mean(cross_entropy)
Which gave me:
(?, 30)
(?, 30)
When executing the back-propagation line above, though, this does not seem to work. Can anyone help?
In response to comment (this basically is the tutorial code from the link mentioned above):
Place Holders:
classes = 30
X = tf.placeholder(tf.float32, name="X-placeholder", shape=(None, 68, 68, 1))
Y_ = tf.placeholder(tf.float32, [None, classes], name="Y_-placeholder")
keepRate1 = tf.placeholder(tf.float32, name="keepRate1-placeholder")
keepRate2 = tf.placeholder(tf.float32, name="keepRate2-placeholder")
Model:
# CONVOLUTION 1 - 1
with tf.name_scope('conv1_1'):
    filter1_1 = tf.Variable(tf.truncated_normal([3, 3, 1, 32], dtype=tf.float32,
                                                stddev=1e-1), name='weights1_1')
    stride = [1, 1, 1, 1]
    conv = tf.nn.conv2d(X, filter1_1, stride, padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[32], dtype=tf.float32),
                         trainable=True, name='biases1_1')
    out = tf.nn.bias_add(conv, biases)
    conv1_1 = tf.nn.relu(out)

# CONVOLUTION 1 - 2
with tf.name_scope('conv1_2'):
    filter1_2 = tf.Variable(tf.truncated_normal([3, 3, 32, 32], dtype=tf.float32,
                                                stddev=1e-1), name='weights1_2')
    conv = tf.nn.conv2d(conv1_1, filter1_2, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[32], dtype=tf.float32),
                         trainable=True, name='biases1_2')
    out = tf.nn.bias_add(conv, biases)
    conv1_2 = tf.nn.relu(out)

# POOL 1
with tf.name_scope('pool1'):
    pool1_1 = tf.nn.max_pool(conv1_2,
                             ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1],
                             padding='SAME',
                             name='pool1_1')
    pool1_1_drop = tf.nn.dropout(pool1_1, keepRate1)

# CONVOLUTION 2 - 1
with tf.name_scope('conv2_1'):
    filter2_1 = tf.Variable(tf.truncated_normal([3, 3, 32, 64], dtype=tf.float32,
                                                stddev=1e-1), name='weights2_1')
    conv = tf.nn.conv2d(pool1_1_drop, filter2_1, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32),
                         trainable=True, name='biases2_1')
    out = tf.nn.bias_add(conv, biases)
    conv2_1 = tf.nn.relu(out)

# CONVOLUTION 2 - 2
with tf.name_scope('conv2_2'):
    filter2_2 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], dtype=tf.float32,
                                                stddev=1e-1), name='weights2_2')
    conv = tf.nn.conv2d(conv2_1, filter2_2, [1, 1, 1, 1], padding='SAME')
    biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32),
                         trainable=True, name='biases2_2')
    out = tf.nn.bias_add(conv, biases)
    conv2_2 = tf.nn.relu(out)

# POOL 2
with tf.name_scope('pool2'):
    pool2_1 = tf.nn.max_pool(conv2_2,
                             ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1],
                             padding='SAME',
                             name='pool2_1')
    pool2_1_drop = tf.nn.dropout(pool2_1, keepRate1)

# FULLY CONNECTED 1
with tf.name_scope('fc1') as scope:
    shape = int(np.prod(pool2_1_drop.get_shape()[1:]))
    fc1w = tf.Variable(tf.truncated_normal([shape, 512], dtype=tf.float32,
                                           stddev=1e-1), name='weights3_1')
    fc1b = tf.Variable(tf.constant(1.0, shape=[512], dtype=tf.float32),
                       trainable=True, name='biases3_1')
    pool2_flat = tf.reshape(pool2_1_drop, [-1, shape])
    out = tf.nn.bias_add(tf.matmul(pool2_flat, fc1w), fc1b)
    fc1 = tf.nn.relu(out)
    fc1_drop = tf.nn.dropout(fc1, keepRate2)

# FULLY CONNECTED 3 & SOFTMAX OUTPUT
with tf.name_scope('softmax') as scope:
    fc2w = tf.Variable(tf.truncated_normal([512, classes], dtype=tf.float32,
                                           stddev=1e-1), name='weights3_2')
    fc2b = tf.Variable(tf.constant(1.0, shape=[classes], dtype=tf.float32),
                       trainable=True, name='biases3_2')
    Ylogits = tf.nn.bias_add(tf.matmul(fc1_drop, fc2w), fc2b)
    Y = tf.nn.softmax(Ylogits)

numEpochs = 400
batchSize = 30
alpha = 1e-5

with tf.name_scope('cross_entropy'):
    print(Ylogits.shape)
    print(Y.shape)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
    loss = tf.reduce_mean(cross_entropy)

with tf.name_scope('accuracy'):
    correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(learning_rate=alpha).minimize(loss)

# Create Session and insert variables
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
The tensor shape (?, 30) means the batch size is not fixed, so you can feed data with any batch size to your graph. The downside is that you can run into exactly these kinds of problems and have to keep track of the tensor shapes in your head.
The thing you need to fix: you have 30 images in one batch but only 1 label per batch. You cannot compute the loss for 30 images with only one label, so you either need to decrease the number of images to 1 or increase the label batch size to 30; it could also be that you are reshaping the tensors incorrectly somewhere.
I would look at where you read your data in and then batch it, since that is most likely where the problem is, or at the places where you reshape the tensors.
Posting your entire code would be more helpful.
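For illustration only (not part of the original answer), here is a minimal sketch of one way to get the label batch into shape [batchSize, classes], assuming y_train holds integer class ids in the range 0..29:
import numpy as np

classes = 30
batchSize = 30

# Hypothetical integer class ids for one batch, i.e. what
# y_train[j*batchSize:(j+1)*batchSize] would contain.
y_batch = np.random.randint(0, classes, size=batchSize)

# One-hot encode so the label batch has shape (batchSize, classes), matching
# the Y_ placeholder [None, classes], instead of reshaping 30 ids into (1, 30).
y_batch_one_hot = np.zeros((batchSize, classes), dtype=np.float32)
y_batch_one_hot[np.arange(batchSize), y_batch] = 1.0

print(y_batch_one_hot.shape)  # (30, 30)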

TensorFlow: training a CNN, but the accuracy never changes

[Screenshots of the training output: first step, second step, third step, fourth step]
The loss declines gradually, but the accuracy is always around 50%.
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)


def maxpool2d(x, k=2):
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding='SAME')


def conv_net(x, weights, biases, dropout):
    # Tensor input become 4-D: [Batch Size, Height, Width, Channel]
    # Convolution Layer
    conv1 = conv2d(x, weights['wc1'], biases['bc1'])
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)
    # Convolution Layer
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
    # Max Pooling (down-sampling)
    conv2 = maxpool2d(conv2, k=2)
    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)
    # Apply Dropout
    fc1 = tf.nn.dropout(fc1, dropout)
    # Output, class prediction
    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    return out


# Store layers weight & bias
weights = {
    # 5x5 conv, 1 input, 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
    # 5x5 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
    # fully connected, 7*7*64 inputs, 1024 outputs
    'wd1': tf.Variable(tf.random_normal([7 * 7 * 64, 1024])),
    # 1024 inputs, 10 outputs (class prediction)
    'out': tf.Variable(tf.random_normal([1024, num_classes]))
}
biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
    'bc2': tf.Variable(tf.random_normal([64])),
    'bd1': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

# Construct model
logits = conv_net(X, weights, biases, keep_prob)
prediction = tf.nn.softmax(logits)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss_op)

# Evaluate model
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Saver used to store the model
saver = tf.train.Saver()

from sklearn.model_selection import train_test_split

# Start training
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    for epoch in range(1, numOfEpoch):
        train_x, val_x, train_y, val_y = train_test_split(Input, Labels, test_size=0.1)
        for i in range(0, len(train_x), batch_size):
            trainLoss, _ = sess.run([loss_op, optimizer], feed_dict={
                X: train_x[i: i + batch_size],
                Y: train_y[i: i + batch_size],
                keep_prob: dropout
            })
            if i % 5 == 0:
                print("The step is in " + str(i) + " step")
                valAcc, valLoss = sess.run([accuracy, loss_op], feed_dict={
                    X: val_x,
                    Y: val_y,
                    keep_prob: 1.0})
                print("Step " + str(epoch) + ", Minibatch Loss= " +
                      "{:.4f}".format(valLoss) + ", Training Accuracy= " +
                      "{:.3f}".format(valAcc))
    print("Optimization Finished!")
    saver.save(sess, "../model.ckpt")
The above is the whole code.
The images are [28 * 28 * 1].
The preprocessing for the images is normalization.
Through each epoch, the loss always decreases. After several epochs, the loss is close to 0.72.
But the accuracy is still around 50%. When the parameters are initialized, the accuracy is already around 50%, and it never changes much during training.
There is also something strange about the predictions: the outputs are close to 1 and 0, rather than float values between 0 and 1.
When I change the initializer to the Xavier initializer, it seems to be normal.
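For reference, a minimal sketch of what switching these variables from tf.random_normal to a Xavier (Glorot) initializer could look like (the zero-initialized biases and the num_classes value are assumptions for illustration, not the original poster's code):
import tensorflow as tf

num_classes = 10  # assumed value for this sketch

# Xavier/Glorot initialization instead of tf.random_normal (stddev 1.0),
# whose large initial weights easily saturate the softmax.
xavier = tf.contrib.layers.xavier_initializer()

weights = {
    'wc1': tf.get_variable('wc1', shape=[5, 5, 1, 32], initializer=xavier),
    'wc2': tf.get_variable('wc2', shape=[5, 5, 32, 64], initializer=xavier),
    'wd1': tf.get_variable('wd1', shape=[7 * 7 * 64, 1024], initializer=xavier),
    'out': tf.get_variable('w_out', shape=[1024, num_classes], initializer=xavier),
}
biases = {
    'bc1': tf.get_variable('bc1', shape=[32], initializer=tf.zeros_initializer()),
    'bc2': tf.get_variable('bc2', shape=[64], initializer=tf.zeros_initializer()),
    'bd1': tf.get_variable('bd1', shape=[1024], initializer=tf.zeros_initializer()),
    'out': tf.get_variable('b_out', shape=[num_classes], initializer=tf.zeros_initializer()),
}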