How to minimize the loss? - tensorflow

This should be a regression problem.
I would like a neural network to estimate the length of a line, in pixels, from an image, like these three images (each image is 200 x 200 pixels):
(example images a), b), c) omitted)
There are 6000 training images and 1000 validation images.
The labels are the distances in pixels:
a) 1.205404496424333018e+02
b) 1.188780888137086436e+02
c) 1.110180165558725918e+02
Here is my training code:
# imports needed by this snippet
import glob
import numpy as np
import tensorflow as tf

img_size = 200

def preprocess_image(image):
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [img_size, img_size])
    image /= 255.0  # normalize to [0,1] range
    return image

def load_and_preprocess_image(path):
    image = tf.read_file(path)
    return preprocess_image(image)

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16

train_labels = np.loadtxt("train_labels.txt")
val_labels = np.loadtxt("test_labels.txt")
train_images = sorted(glob.glob("train_img/img_*.jpg"))
val_images = sorted(glob.glob("test_img/img_*.jpg"))
steps_per_epoch_count = tf.ceil(len(train_images) / BATCH_SIZE)

train_path_ds = tf.data.Dataset.from_tensor_slices(train_images)
val_path_ds = tf.data.Dataset.from_tensor_slices(val_images)

train_image_ds = train_path_ds.map(load_and_preprocess_image,
                                   num_parallel_calls=AUTOTUNE)
train_label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(train_labels, tf.float32))
train_image_label_ds = tf.data.Dataset.zip((train_image_ds, train_label_ds))

val_image_ds = val_path_ds.map(load_and_preprocess_image,
                               num_parallel_calls=AUTOTUNE)
val_label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(val_labels, tf.float32))
val_image_label_ds = tf.data.Dataset.zip((val_image_ds, val_label_ds))

model = tf.keras.models.Sequential([
    tf.keras.layers.Convolution2D(16, 3, 3, input_shape=(img_size, img_size, 3),
                                  activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Convolution2D(32, 3, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    # tf.keras.layers.Convolution2D(64, 3, 3, activation='relu'),
    # tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(400, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(200, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(100, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.05),
    tf.keras.layers.Dense(1, activation=tf.nn.relu)
])

model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01),
              loss="mean_squared_error",
              metrics=["mean_absolute_error", "mean_squared_error"])

train_ds = train_image_label_ds.apply(
    tf.data.experimental.shuffle_and_repeat(buffer_size=len(train_images)))
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

val_ds = val_image_label_ds.apply(
    tf.data.experimental.shuffle_and_repeat(buffer_size=len(val_images)))
val_ds = val_ds.batch(BATCH_SIZE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

history = model.fit(
    train_ds,
    epochs=80,
    validation_data=val_ds,
    steps_per_epoch=374,
    validation_steps=62
)
However, this is the train vs eval mean_squared_error plot:
Question:
Why is the validation loss not stable?
The average mean squared error is about 400 in training, which seems too high. What modifications can I make to improve the estimation?
EDIT:
This is my latest model:
Learning rate = 0.01
Batch size = 16
model = tf.keras.models.Sequential([
    tf.keras.layers.Convolution2D(16, 3, 3, input_shape=(img_size, img_size, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Convolution2D(32, 3, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  # , kernel_regularizer=tf.keras.regularizers.l2(0.001)
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  # , kernel_regularizer=tf.keras.regularizers.l2(0.001)
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2, activation=tf.nn.relu),  # , kernel_regularizer=tf.keras.regularizers.l2(0.001)
    tf.keras.layers.Dense(1, activation="linear")
])
The output looks like this:
As you can see, the training and validation losses are almost identical. The MSE stabilizes around 2393 for both, which corresponds to a root-mean-squared error of sqrt(2393) ≈ 48.9 pixels, which is quite high.
Any advice on lowering it further? Is this normal?
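For reference, one small change I could make (a sketch, assuming a TensorFlow version that provides tf.keras.metrics.RootMeanSquaredError) is to track the pixel error directly as a metric instead of converting the MSE by hand:

model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01),
              loss="mean_squared_error",
              metrics=[tf.keras.metrics.RootMeanSquaredError()])  # reported in pixels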

Related

Keras model.fit does not complain about an incompatible shape?

I defined my model starting with inputs = tf.keras.Input(shape=(512, 512, 3), batch_size=BATCH_SIZE). Then I use model.fit with data of shape (1, 720, 1280, 3). The model still trains normally and the loss decreases. Why is that?
Thanks.
I tried to train with the same shape, but the results did not turn out to be good.
So here is the code:
# imports needed by this snippet (the original import paths may differ)
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

inputs = tf.keras.Input(shape=(512, 512, 3), batch_size=BATCH_SIZE)  # REVISE
x = tf.keras.layers.Cropping2D(cropping=((256, 0), (0, 0)))(inputs)
x = tf.keras.layers.Resizing(66, 200)(x)
# x = x / 255
x = tf.keras.layers.Conv2D(24, (5, 5), (2, 2), activation="elu")(x)
x = tf.keras.layers.Conv2D(36, (5, 5), (2, 2), activation="elu")(x)
x = tf.keras.layers.Conv2D(48, (5, 5), (2, 2), activation="elu")(x)
x = tf.keras.layers.Conv2D(64, (3, 3), activation="elu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Conv2D(64, (3, 3), activation="elu")(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(100, activation='elu')(x)
x = tf.keras.layers.Dense(50, activation='elu')(x)
x = tf.keras.layers.Dense(10, activation='elu')(x)
outputs = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
print(model.summary())

optimizer = Adam(learning_rate=1e-3)
model.compile(loss='mse', optimizer=optimizer)

img_gen = image_data_generator(X_train, y_train, batch_size=1)
X = next(img_gen)[0]
print(X.shape)
print(model(X).shape)

# saves the model weights after each epoch if the validation loss decreased
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(model_output_dir, 'lane_navigation_new_nvmodel_big2'),
    verbose=1, save_best_only=True)  # revise

with tf.device('/device:GPU:0'):
    history = model.fit(
        image_data_generator(X_train, y_train, batch_size=BATCH_SIZE),
        # important: a larger batch_size trains much better; with batch_size=1 it learns nothing!
        steps_per_epoch=10,   # 300
        epochs=300,
        validation_data=image_data_generator(X_valid, y_valid, batch_size=BATCH_SIZE),
        validation_steps=10,  # 200
        verbose=1,
        shuffle=1,
        callbacks=[checkpoint_callback])
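For reference, a minimal probe I can add on top of the code above (my own sketch) is to call the model directly on tensors of different spatial sizes. Because Cropping2D and Resizing reduce every input to a fixed 66 x 200 feature map before the convolutions, both sizes seem to go through, and Keras appears to log a shape warning rather than raise an error:

import tensorflow as tf

for shape in [(1, 512, 512, 3), (1, 720, 1280, 3)]:
    probe = tf.zeros(shape)
    print(shape, "->", model(probe).shape)  # both should print (1, 1)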

CNN trained on MNIST data set always show same output for all inputs

I am using a Keras CNN for handwritten digit recognition. I downloaded the dataset from Kaggle. The preprocessing is:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")
Y_train = train["label"]
X_train = train.drop(labels = ["label"],axis = 1)
X_train = X_train / 255.0
test = test / 255.0
X_train = X_train.values.reshape(-1,28,28,1)
test = test.values.reshape(-1,28,28,1)
Y_train = to_categorical(Y_train, num_classes = 10)
random_seed = 2
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.1, random_state=random_seed)
My model looks like:
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
                 activation='relu', input_shape=(28, 28, 1)))
model.add(Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
                 activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
                 activation='relu'))
model.add(Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
                 activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation="softmax"))

optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

epochs = 1
batch_size = 86

datagen = ImageDataGenerator(
    featurewise_center=False,             # set input mean to 0 over the dataset
    samplewise_center=False,              # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,   # divide each input by its std
    zca_whitening=False,                  # apply ZCA whitening
    rotation_range=10,                    # randomly rotate images in the range (degrees, 0 to 180)
    zoom_range=0.1,                       # randomly zoom image
    width_shift_range=0.1,                # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.1,               # randomly shift images vertically (fraction of total height)
    horizontal_flip=False,                # randomly flip images
    vertical_flip=False)                  # randomly flip images

datagen.fit(X_train)

history = model.fit(datagen.flow(X_train, Y_train, batch_size=batch_size),
                    epochs=epochs,
                    validation_data=(X_val, Y_val),
                    verbose=2,
                    steps_per_epoch=X_train.shape[0] // batch_size,
                    callbacks=[learning_rate_reduction])
Predicting my input:
def predict(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (28, 28))
    image = image.astype('float32')
    image = image.reshape(1, 28, 28, 1)
    image /= 255

    model = load_model('./model.h5')
    pred = model.predict(image, batch_size=1)
    print("Predicted Number: ", pred.argmax())

predict(cv2.imread('./testImages/1.png'))
What am I doing wrong?
The desired result is the digit shown in the input image; instead I get the same output (i.e. digit 8) for every input.
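One sanity check I could run (a sketch, not something already in my code) is to predict on a few validation samples that went through the exact training preprocessing. If those come out right, the problem is more likely in how the cv2-loaded image is prepared (for example, the Kaggle digits are white strokes on a black background, while a photographed digit is usually dark on white, so the photo may need inverting) than in the model itself:

preds = model.predict(X_val[:5])
print("predicted:", preds.argmax(axis=1))
print("actual:   ", Y_val[:5].argmax(axis=1))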

HIGH Resolution Image and Colab PRO Crash

I have 1700 images with a height and width of 1000 x 1000. There are minor details in them, so I prefer to keep this size. However, my Google Colab Pro session crashes. Please help.
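For scale, here is a rough back-of-the-envelope estimate (my own numbers, based on the model below, not measured on Colab) of just the first convolution's activations for one batch:

batch_size = 32
first_conv_filters = 32                                      # Conv2D(32, (3,3), padding='same') below
elements = batch_size * 1000 * 1000 * first_conv_filters     # 'same' padding keeps the 1000 x 1000 size
print(elements * 4 / 1e9, "GB as float32")                   # roughly 4.1 GB for this one layer's output alone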
'''
##title IMAGE TO DATA, NORMALIZATION AND AUGMENTATION
# Directories with subdirectories as classes for training and validation datasets
%%capture
train_dir = '/content/Dataset/Training'
validation_dir = '/content/Dataset/Validation'

# Set batch size and image height and width
batch_size = 32
IMG_HEIGHT, IMG_WIDTH = (1000, 1000)

# Image to data transform using ImageDataGenerator of Keras
# Image to data for training data
Dataset_Image_Training = ImageDataGenerator(rescale=1./255, zoom_range=[0.8, 1.5],
                                            brightness_range=[0.8, 2.0])
train_data_gen = Dataset_Image_Training.flow_from_directory(
    batch_size=batch_size,
    directory=train_dir,
    shuffle=True,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary')

# Image to data for validation data
validation_image_generator = ImageDataGenerator(rescale=1./255, zoom_range=[0.8, 1.5],
                                                brightness_range=[0.8, 2.0])
val_data_gen = validation_image_generator.flow_from_directory(
    batch_size=batch_size,
    directory=validation_dir,
    shuffle=True,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary')

# Check classes in dataset
train_data_gen.class_indices

##title Deep Learning CNN Model with Keras Sequential with Dropout
#%%capture
model = Sequential([
    Conv2D(32, (3, 3), padding='same', activation='relu',
           input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    MaxPool2D(2, 2),
    Dropout(0.5),
    Conv2D(64, (3, 3), padding='same', activation='relu'),
    MaxPool2D(2, 2),
    Dropout(0.5),
    Conv2D(128, (3, 3), padding='same', activation='relu'),
    MaxPool2D(2, 2),
    Dropout(0.5),
    Conv2D(256, (3, 3), padding='same', activation='relu'),
    MaxPool2D(2, 2),
    Dropout(0.5),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')])

# Model compilation
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# TensorBoard setup
import tensorflow as tf
import datetime
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Checkpoint and early-stopping settings
filepath = '/content/drive/My Drive/DL_Model.hdf5'
checkpoint = [tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_accuracy', mode='max',
                                                 save_best_only=True, save_weights_only=False,
                                                 verbose=1),
              tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=15, verbose=1),
              tensorboard_callback]

# Model fitting
hist = model.fit(
    train_data_gen,
    steps_per_epoch=None,
    epochs=500,
    validation_data=val_data_gen,
    validation_steps=None,
    callbacks=checkpoint
)

# Accuracy print
train_acc = max(hist.history['accuracy'])
val_acc = max(hist.history['val_accuracy'])
train_loss = min(hist.history['loss'])
val_loss = min(hist.history['val_loss'])
print('Training accuracy is')
print(train_acc)
print('Validation accuracy is')
print(val_acc)
print('Training loss is')
print(train_loss)
print('Validation loss is')
print(val_loss)

# Load TensorBoard
%load_ext tensorboard
%tensorboard --logdir logs
'''

Stuck In First Epoch When Training CNN model in google Colab

I created a model to identify plant diseases, and I expect it to identify 10 diseases. In a Jupyter notebook it worked fine, but it was slow due to GPU constraints. Then I decided to run the model in Google Colab, but it did not run; it got stuck at the first epoch.
The code I use to construct the model is given below
BATCH_SIZE = 64
IMAGE_SIZE = 256
CHANNELS = 3
EPOCHS = 10

dataset = tf.keras.preprocessing.image_dataset_from_directory(
    "/content/drive/MyDrive/google-colab-files/PlantVillage",
    seed=123,
    shuffle=True,
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE
)

def get_dataset_partisions_tf(ds, trains_split=0.8, val_split=0.1, test_split=0.1,
                              shuffle=True, shuffle_size=10000):
    ds_size = len(ds)
    if shuffle:
        ds = ds.shuffle(shuffle_size, seed=12)
    train_size = int(trains_split * ds_size)
    val_size = int(val_split * ds_size)
    train_ds = ds.take(train_size)
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    return train_ds, val_ds, test_ds

train_ds, val_ds, test_ds = get_dataset_partisions_tf(dataset)

train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.cache().shuffle(1000).prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = test_ds.cache().shuffle(1000).prefetch(buffer_size=tf.data.AUTOTUNE)

resize_and_rescales = Sequential([
    layers.experimental.preprocessing.Resizing(IMAGE_SIZE, IMAGE_SIZE),
    layers.experimental.preprocessing.Rescaling(1.0/255)
])

data_agmetation = Sequential([
    layers.experimental.preprocessing.RandomFlip('horizontal_and_vertical'),
    layers.experimental.preprocessing.RandomRotation(0.2),
])

input_shape = (BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, CHANNELS)
n_classes = 10

model = Sequential([
    resize_and_rescales,
    data_agmetation,
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(n_classes, activation='softmax'),
])
model.build(input_shape=input_shape)
model.summary()
A screenshot of the model summary is:
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy']
)
When I use the following code to train data:
model.fit(
    train_ds,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=2,
    validation_data=val_ds
)
It keeps getting stuck in the first epoch.
Check if TensorFlow is using a GPU or not. You can try reducing batch size.
My assumption is that this is because of your verbose setting; you should set verbose to 1 to see which step of the epoch you are in.
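A minimal way to check the GPU point above (a sketch, assuming TF 2.x as used in the question):

import tensorflow as tf

print(tf.config.list_physical_devices('GPU'))  # an empty list means no GPU is visible
tf.debugging.set_log_device_placement(True)    # optionally, log which device each op runs on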

TensorFlow CNN: Why is the validation loss significantly different from the start, and why has it been increasing?

This is a classification model for ten categories of pictures. My code has three files: convNet.py defines the CNN model, read_TFRecord.py reads the data, and train.py trains and evaluates the model. The training set has 80,000 samples and the validation set has 20,000.
Question:
In the first epoch:
training loss = 2.11, train accuracy = 25.61%
validation loss = 3.05, validation accuracy = 8.29%
Why is the validation loss significantly different right from the start? And why is the validation accuracy always below 10%?
Over the 10 epochs of training:
Training proceeds as normal learning, but the validation loss slowly increases and the validation accuracy keeps oscillating around 10%. Is it over-fitting? But I have taken some measures, such as adding regularization losses and dropout. I do not know where the problem is. I hope you can help me.
convNet.py:
def convNet(features, mode):
    input_layer = tf.reshape(features, [-1, 100, 100, 3])
    tf.summary.image('input', input_layer)
    # conv1
    with tf.name_scope('conv1'):
        conv1 = tf.layers.conv2d(
            inputs=input_layer,
            filters=32,
            kernel_size=5,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv1'
        )
        conv1_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv1')
        tf.summary.histogram('kernel', conv1_vars[0])
        tf.summary.histogram('bias', conv1_vars[1])
        tf.summary.histogram('act', conv1)
    # pool1 100->50
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2, name='pool1')
    # dropout
    pool1_dropout = tf.layers.dropout(
        inputs=pool1, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN),
        name='pool1_dropout')
    # conv2
    with tf.name_scope('conv2'):
        conv2 = tf.layers.conv2d(
            inputs=pool1_dropout,
            filters=64,
            kernel_size=5,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv2'
        )
        conv2_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv2')
        tf.summary.histogram('kernel', conv2_vars[0])
        tf.summary.histogram('bias', conv2_vars[1])
        tf.summary.histogram('act', conv2)
    # pool2 50->25
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2, name='pool2')
    # dropout
    pool2_dropout = tf.layers.dropout(
        inputs=pool2, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN),
        name='pool2_dropout')
    # conv3
    with tf.name_scope('conv3'):
        conv3 = tf.layers.conv2d(
            inputs=pool2_dropout,
            filters=128,
            kernel_size=3,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv3'
        )
        conv3_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv3')
        tf.summary.histogram('kernel', conv3_vars[0])
        tf.summary.histogram('bias', conv3_vars[1])
        tf.summary.histogram('act', conv3)
    # pool3 25->12
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2, name='pool3')
    # dropout
    pool3_dropout = tf.layers.dropout(
        inputs=pool3, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN),
        name='pool3_dropout')
    # conv4
    with tf.name_scope('conv4'):
        conv4 = tf.layers.conv2d(
            inputs=pool3_dropout,
            filters=128,
            kernel_size=3,
            padding="same",
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
            name='conv4'
        )
        conv4_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'conv4')
        tf.summary.histogram('kernel', conv4_vars[0])
        tf.summary.histogram('bias', conv4_vars[1])
        tf.summary.histogram('act', conv4)
    # pool4 12->6
    pool4 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=2, name='pool4')
    # dropout
    pool4_dropout = tf.layers.dropout(
        inputs=pool4, rate=0.5, training=tf.equal(mode, learn.ModeKeys.TRAIN),
        name='pool4_dropout')
    pool4_flat = tf.reshape(pool4_dropout, [-1, 6 * 6 * 128])
    # fc1
    with tf.name_scope('fc1'):
        fc1 = tf.layers.dense(inputs=pool4_flat, units=1024, activation=tf.nn.relu,
                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                              kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                              name='fc1')
        fc1_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'fc1')
        tf.summary.histogram('kernel', fc1_vars[0])
        tf.summary.histogram('bias', fc1_vars[1])
        tf.summary.histogram('act', fc1)
    # dropout
    fc1_dropout = tf.layers.dropout(
        inputs=fc1, rate=0.3, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='fc1_dropout')
    # fc2
    with tf.name_scope('fc2'):
        fc2 = tf.layers.dense(inputs=fc1_dropout, units=512, activation=tf.nn.relu,
                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                              kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                              name='fc2')
        fc2_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'fc2')
        tf.summary.histogram('kernel', fc2_vars[0])
        tf.summary.histogram('bias', fc2_vars[1])
        tf.summary.histogram('act', fc2)
    # dropout
    fc2_dropout = tf.layers.dropout(
        inputs=fc2, rate=0.3, training=tf.equal(mode, learn.ModeKeys.TRAIN), name='fc2_dropout')
    # logits
    with tf.name_scope('out'):
        logits = tf.layers.dense(inputs=fc2_dropout, units=10, activation=None,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.01),
                                 kernel_regularizer=tf.contrib.layers.l2_regularizer(0.01),
                                 name='out')
        out_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'out')
        tf.summary.histogram('kernel', out_vars[0])
        tf.summary.histogram('bias', out_vars[1])
        tf.summary.histogram('act', logits)
    return logits
read_TFRecord.py:
def read_and_decode(filename, width, height, channel):
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(serialized_example,
                                       features={
                                           'label': tf.FixedLenFeature([], tf.int64),
                                           'img_raw': tf.FixedLenFeature([], tf.string),
                                       })
    img = tf.decode_raw(features['img_raw'], tf.uint8)
    img = tf.reshape(img, [width, height, channel])
    img = tf.cast(img, tf.float16) * (1. / 255) - 0.5
    label = tf.cast(features['label'], tf.int16)
    return img, label
train.py:
# step 1
TRAIN_TFRECORD = 'F:/10-image-set2/train.tfrecords'  # train data set
VAL_TFRECORD = 'F:/10-image-set2/val.tfrecords'      # validation data set
WIDTH = 100    # image width
HEIGHT = 100   # image height
CHANNEL = 3    # image channel
TRAIN_BATCH_SIZE = 64
VAL_BATCH_SIZE = 16

train_img, train_label = read_and_decode(TRAIN_TFRECORD, WIDTH, HEIGHT, CHANNEL)
val_img, val_label = read_and_decode(VAL_TFRECORD, WIDTH, HEIGHT, CHANNEL)

x_train_batch, y_train_batch = tf.train.shuffle_batch(
    [train_img, train_label], batch_size=TRAIN_BATCH_SIZE,
    capacity=80000, min_after_dequeue=79999,
    num_threads=64, name='train_shuffle_batch')
x_val_batch, y_val_batch = tf.train.shuffle_batch(
    [val_img, val_label], batch_size=VAL_BATCH_SIZE,
    capacity=20000, min_after_dequeue=19999,
    num_threads=64, name='val_shuffle_batch')

# step 2
x = tf.placeholder(tf.float32, shape=[None, WIDTH, HEIGHT, CHANNEL], name='x')
y_ = tf.placeholder(tf.int32, shape=[None, ], name='y_')
mode = tf.placeholder(tf.string, name='mode')
step = tf.get_variable(shape=(), dtype=tf.int32, initializer=tf.zeros_initializer(), name='step')
tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, step)

logits = convNet(x, mode)

with tf.name_scope('Reg_losses'):
    reg_losses = tf.cond(tf.equal(mode, learn.ModeKeys.TRAIN),
                         lambda: tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
                         lambda: tf.constant(0, dtype=tf.float32))
with tf.name_scope('Loss'):
    loss = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits) + reg_losses

train_op = tf.train.AdamOptimizer().minimize(loss, step)
correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), y_)
with tf.name_scope('Accuracy'):
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# step 3
tf.summary.scalar("reg_losses", reg_losses)
tf.summary.scalar("loss", loss)
tf.summary.scalar("accuracy", acc)
merged = tf.summary.merge_all()

# step 4
with tf.Session() as sess:
    summary_dir = './logs/summary/'
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver = tf.train.Saver(max_to_keep=1)
    train_writer = tf.summary.FileWriter(summary_dir + 'train', sess.graph)
    valid_writer = tf.summary.FileWriter(summary_dir + 'valid')
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    max_acc = 0
    MAX_EPOCH = 10
    for epoch in range(MAX_EPOCH):
        # training
        train_step = int(80000 / TRAIN_BATCH_SIZE)
        train_loss, train_acc = 0, 0
        for step in range(epoch * train_step, (epoch + 1) * train_step):
            x_train, y_train = sess.run([x_train_batch, y_train_batch])
            train_summary, _, err, ac = sess.run([merged, train_op, loss, acc],
                                                 feed_dict={x: x_train, y_: y_train,
                                                            mode: learn.ModeKeys.TRAIN,
                                                            global_step: step})
            train_loss += err
            train_acc += ac
            if (step + 1) % 50 == 0:
                train_writer.add_summary(train_summary, step)
        print("Epoch %d,train loss= %.2f,train accuracy=%.2f%%" % (
            epoch, (train_loss / train_step), (train_acc / train_step * 100.0)))
        # validation
        val_step = int(20000 / VAL_BATCH_SIZE)
        val_loss, val_acc = 0, 0
        for step in range(epoch * val_step, (epoch + 1) * val_step):
            x_val, y_val = sess.run([x_val_batch, y_val_batch])
            val_summary, err, ac = sess.run([merged, loss, acc],
                                            feed_dict={x: x_val, y_: y_val,
                                                       mode: learn.ModeKeys.EVAL,
                                                       global_step: step})
            val_loss += err
            val_acc += ac
            if (step + 1) % 50 == 0:
                valid_writer.add_summary(val_summary, step)
        print("Epoch %d,validation loss= %.2f,validation accuracy=%.2f%%" % (
            epoch, (val_loss / val_step), (val_acc / val_step * 100.0)))
        # save model
        if val_acc > max_acc:
            max_acc = val_acc
            saver.save(sess, summary_dir + '/10-image.ckpt', epoch)
            print("model saved")
    coord.request_stop()
    coord.join(threads)
TensorBoard result (orange is train, blue is validation):
(TensorBoard plots omitted: accuracy, loss, reg_losses, conv1, conv2, conv3, conv4, fc1, fc2, output)
My data:
(train and validation data screenshots omitted)
I doubt this is an issue of overfitting - the losses are significantly different right from the start and diverge further well before you get through your first epoch (~500 batches). Without seeing your dataset it's difficult to say more, though as a first step I'd encourage you to visualize both training and evaluation input data to make sure the issue isn't something there. The fact that you initially get significantly less than 10% on a 10-class classification problem indicates you almost certainly have something wrong there.
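For example, a quick way to eyeball a few decoded samples from both queues (a rough sketch reusing x_train_batch / x_val_batch from your train.py; adjust as needed):

import matplotlib.pyplot as plt

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for name, tensors in [('train', (x_train_batch, y_train_batch)),
                          ('val', (x_val_batch, y_val_batch))]:
        imgs, labels = sess.run(list(tensors))
        plt.figure(name)
        for i in range(4):
            plt.subplot(1, 4, i + 1)
            plt.imshow(imgs[i].astype('float32') + 0.5)  # undo the -0.5 shift in read_and_decode
            plt.title(int(labels[i]))
    coord.request_stop()
    coord.join(threads)
plt.show()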
Having said that, you will likely run into problems with overfitting using this model because, despite what you may think, you aren't using dropout or regularization.
Dropout: mode == learn.ModeKeys.TRAIN is False if mode is a tensor, so you're not using dropout ever. You could use tf.equal(mode, learn.ModeKeys.TRAIN), but I think you'd be much better off passing a training bool tensor to your convNet and feeding in the appropriate value, as sketched below.
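A rough sketch of that approach (using the TF 1.x APIs from your code; the names here are illustrative):

# one boolean placeholder that defaults to False (evaluation mode)
training = tf.placeholder_with_default(False, shape=(), name='training')

# inside convNet, pass it straight to the dropout layers, e.g.
pool1_dropout = tf.layers.dropout(inputs=pool1, rate=0.5, training=training,
                                  name='pool1_dropout')

# and feed {training: True} only in the training sess.run calls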
Regularization: you're creating the regularization loss terms and they're being added to the tf.GraphKeys.REGULARIZATION_LOSSES collection, but the loss you're minimizing doesn't use them. Add the following:
loss += tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
before you optimize.
A note on the optimization step: you shouldn't be feeding in a value to the session run like you are. Every time you run the optimization operation it will update the step variable you passed when you created it, so just create it as an int variable and leave it alone. See the following example code:
import tensorflow as tf

x = tf.get_variable(shape=(4, 3), dtype=tf.float32,
                    initializer=tf.random_normal_initializer(), name='x')
loss = tf.nn.l2_loss(x)
step = tf.get_variable(shape=(), dtype=tf.int32,
                       initializer=tf.zeros_initializer(), name='step')
tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, step)  # good practice

opt = tf.train.AdamOptimizer().minimize(loss, step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    step_val = sess.run(step)
    print(step_val)  # 0
    sess.run(opt)
    step_val = sess.run(step)
    print(step_val)  # 1