Why the loss function in CNN become NAN after few epochs - tensorflow

I’m trying to do a binary classification on a very unbalanced dataset.
The model is doing great, but after some random epochs the loss becomes nan, also precision, recall, TP, and FP, all become ZERO.
Sometimes it happens after the 3rd epoch, sometimes after the 20th epoch.
The code:
import numpy as np
import os
from keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
import pandas
nodules_csv = pandas.read_csv("/cropped_nodules.csv")
base_dir = "/cropped_nodules/"
all_image_paths = os.listdir(base_dir)
all_image_paths = sorted(all_image_paths,key=lambda x: int(os.path.splitext(x)[0]))
nodules = nodules_csv.rename(columns = {'SN':'ID'})
labels= nodules.iloc[:,1]
labels = labels.to_numpy()
class DataGenerator(Sequence):
# Learned from https://mahmoudyusof.github.io/facial-keypoint-detection/data-generator/
def __init__(self, all_image_paths, labels, base_dir, output_size, shuffle=False, batch_size=10):
"""
Initializes a data generator object
:param csv_file: file in which image names and numeric labels are stored
:param base_dir: the directory in which all images are stored
:param output_size: image output size after preprocessing
:param shuffle: shuffle the data after each epoch
:param batch_size: The size of each batch returned by __getitem__
"""
self.imgs = all_image_paths
self.base_dir = base_dir
self.output_size = output_size
self.shuffle = shuffle
self.batch_size = batch_size
self.labels = labels
self.on_epoch_end()
def on_epoch_end(self):
self.indices = np.arange(len(self.imgs))
if self.shuffle:
np.random.shuffle(self.indices)
def __len__(self):
return int(len(self.imgs) / self.batch_size)
def __getitem__(self, idx):
## Initializing Batch
# that one in the shape is just for a one channel images
# if you want to use colored images you might want to set that to 3
X = np.empty((self.batch_size, *self.output_size,1))
# (x, y, h, w)
y = np.empty((self.batch_size, 1))
# get the indices of the requested batch
indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
for i, data_index in enumerate(indices):
img_path = os.path.join(self.base_dir,
self.imgs[data_index])
img = np.load(img_path)
while img.shape == (31,31,31):
img = np.expand_dims(img, axis=3)
## this is where you preprocess the image
## make sure to resize it to be self.output_size
label = self.labels[data_index]
## if you have any preprocessing for
## the labels too do it here
X[i,] = img
y[i] = label
return X, y
## Defining and training the model
model = Sequential([
## define the model's architecture
layers.Conv3D(filters=32, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=32, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.Conv3D(filters=64, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=64, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.Conv3D(filters=128, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=128, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.Conv3D(filters=256, kernel_size=3, activation="relu", padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=256, kernel_size=3, activation="relu", padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.GlobalAveragePooling3D(),
layers.Dense(units=512, activation="relu"),
layers.BatchNormalization(),
layers.Dropout(0.3),
layers.Dense(units=1, activation="sigmoid"),
])
train_gen = DataGenerator(all_image_paths, labels, base_dir, (31, 31, 31), batch_size=128, shuffle=False)
## compile the model first of course
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy', 'Precision', 'Recall', 'FalseNegatives', 'FalsePositives', 'TrueNegatives', 'TruePositives'])
model.build(input_shape= (128,31,31,31,1))
model.summary()
# now let's train the model
history = model.fit(train_gen, epochs=25)
and the results below:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv3d (Conv3D) (128, 31, 31, 31, 32) 896
batch_normalization (BatchN (128, 31, 31, 31, 32) 128
ormalization)
conv3d_1 (Conv3D) (128, 31, 31, 31, 32) 27680
batch_normalization_1 (Batc (128, 31, 31, 31, 32) 128
hNormalization)
max_pooling3d (MaxPooling3D (128, 15, 15, 15, 32) 0
)
batch_normalization_2 (Batc (128, 15, 15, 15, 32) 128
hNormalization)
conv3d_2 (Conv3D) (128, 15, 15, 15, 64) 55360
batch_normalization_3 (Batc (128, 15, 15, 15, 64) 256
hNormalization)
conv3d_3 (Conv3D) (128, 15, 15, 15, 64) 110656
batch_normalization_4 (Batc (128, 15, 15, 15, 64) 256
hNormalization)
max_pooling3d_1 (MaxPooling (128, 7, 7, 7, 64) 0
3D)
batch_normalization_5 (Batc (128, 7, 7, 7, 64) 256
hNormalization)
conv3d_4 (Conv3D) (128, 7, 7, 7, 128) 221312
batch_normalization_6 (Batc (128, 7, 7, 7, 128) 512
hNormalization)
conv3d_5 (Conv3D) (128, 7, 7, 7, 128) 442496
batch_normalization_7 (Batc (128, 7, 7, 7, 128) 512
hNormalization)
max_pooling3d_2 (MaxPooling (128, 3, 3, 3, 128) 0
3D)
batch_normalization_8 (Batc (128, 3, 3, 3, 128) 512
hNormalization)
conv3d_6 (Conv3D) (128, 3, 3, 3, 256) 884992
batch_normalization_9 (Batc (128, 3, 3, 3, 256) 1024
hNormalization)
conv3d_7 (Conv3D) (128, 3, 3, 3, 256) 1769728
batch_normalization_10 (Bat (128, 3, 3, 3, 256) 1024
chNormalization)
max_pooling3d_3 (MaxPooling (128, 1, 1, 1, 256) 0
3D)
batch_normalization_11 (Bat (128, 1, 1, 1, 256) 1024
chNormalization)
global_average_pooling3d (G (128, 256) 0
lobalAveragePooling3D)
dense (Dense) (128, 512) 131584
batch_normalization_12 (Bat (128, 512) 2048
chNormalization)
dropout (Dropout) (128, 512) 0
dense_1 (Dense) (128, 1) 513
=================================================================
Total params: 3,653,025
Trainable params: 3,649,121
Non-trainable params: 3,904
_________________________________________________________________
Epoch 1/25
2022-12-15 17:46:04.897341: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401
2022-12-15 17:46:05.829836: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-12-15 17:46:06.464508: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-12-15 17:46:07.214021: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2319ed30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-12-15 17:46:07.214054: I tensorflow/compiler/xla/service/service.cc:181] StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2022-12-15 17:46:07.217900: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-12-15 17:46:07.277629: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-12-15 17:46:07.317843: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
5898/5898 [==============================] - 1184s 199ms/step - loss: 0.0203 - accuracy: 0.9956 - precision: 0.0807 - recall: 0.1113 - false_negatives: 1381.0000 - false_positives: 1972.0000 - true_negatives: 751418.0000 - true_positives: 173.0000
Epoch 2/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0068 - accuracy: 0.9984 - precision: 0.6869 - recall: 0.3779 - false_negatives: 968.0000 - false_positives: 268.0000 - true_negatives: 753120.0000 - true_positives: 588.0000
Epoch 3/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0052 - accuracy: 0.9986 - precision: 0.7472 - recall: 0.4782 - false_negatives: 813.0000 - false_positives: 252.0000 - true_negatives: 753134.0000 - true_positives: 745.0000
Epoch 4/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0045 - accuracy: 0.9987 - precision: 0.7676 - recall: 0.5540 - false_negatives: 694.0000 - false_positives: 261.0000 - true_negatives: 753127.0000 - true_positives: 862.0000
Epoch 5/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0039 - accuracy: 0.9988 - precision: 0.7913 - recall: 0.5963 - false_negatives: 629.0000 - false_positives: 245.0000 - true_negatives: 753141.0000 - true_positives: 929.0000
Epoch 6/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0033 - accuracy: 0.9990 - precision: 0.8080 - recall: 0.6465 - false_negatives: 550.0000 - false_positives: 239.0000 - true_negatives: 753149.0000 - true_positives: 1006.0000
Epoch 7/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0029 - accuracy: 0.9990 - precision: 0.8178 - recall: 0.6913 - false_negatives: 481.0000 - false_positives: 240.0000 - true_negatives: 753146.0000 - true_positives: 1077.0000
Epoch 8/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0024 - accuracy: 0.9992 - precision: 0.8452 - recall: 0.7530 - false_negatives: 385.0000 - false_positives: 215.0000 - true_negatives: 753170.0000 - true_positives: 1174.0000
Epoch 9/25
5898/5898 [==============================] - 1177s 200ms/step - loss: 0.0018 - accuracy: 0.9993 - precision: 0.8632 - recall: 0.8077 - false_negatives: 299.0000 - false_positives: 199.0000 - true_negatives: 753190.0000 - true_positives: 1256.0000
Epoch 10/25
5898/5898 [==============================] - 1180s 200ms/step - loss: 0.0014 - accuracy: 0.9995 - precision: 0.9055 - recall: 0.8508 - false_negatives: 232.0000 - false_positives: 138.0000 - true_negatives: 753251.0000 - true_positives: 1323.0000
Epoch 11/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0014 - accuracy: 0.9995 - precision: 0.9086 - recall: 0.8678 - false_negatives: 206.0000 - false_positives: 136.0000 - true_negatives: 753250.0000 - true_positives: 1352.0000
Epoch 12/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0011 - accuracy: 0.9996 - precision: 0.9207 - recall: 0.8952 - false_negatives: 163.0000 - false_positives: 120.0000 - true_negatives: 753268.0000 - true_positives: 1393.0000
Epoch 13/25
5898/5898 [==============================] - 1182s 200ms/step - loss: 8.5650e-04 - accuracy: 0.9997 - precision: 0.9382 - recall: 0.9177 - false_negatives: 128.0000 - false_positives: 94.0000 - true_negatives: 753294.0000 - true_positives: 1428.0000
Epoch 14/25
5898/5898 [==============================] - 1179s 200ms/step - loss: 7.9298e-04 - accuracy: 0.9998 - precision: 0.9509 - recall: 0.9326 - false_negatives: 105.0000 - false_positives: 75.0000 - true_negatives: 753312.0000 - true_positives: 1452.0000
Epoch 15/25
5898/5898 [==============================] - 1179s 200ms/step - loss: 7.1897e-04 - accuracy: 0.9998 - precision: 0.9576 - recall: 0.9422 - false_negatives: 90.0000 - false_positives: 65.0000 - true_negatives: 753322.0000 - true_positives: 1467.0000
Epoch 16/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 6.0985e-04 - accuracy: 0.9998 - precision: 0.9567 - recall: 0.9499 - false_negatives: 78.0000 - false_positives: 67.0000 - true_negatives: 753320.0000 - true_positives: 1479.0000
Epoch 17/25
5898/5898 [==============================] - 1182s 200ms/step - loss: 6.1805e-04 - accuracy: 0.9998 - precision: 0.9648 - recall: 0.9499 - false_negatives: 78.0000 - false_positives: 54.0000 - true_negatives: 753332.0000 - true_positives: 1480.0000
Epoch 18/25
5898/5898 [==============================] - 1182s 200ms/step - loss: 4.7617e-04 - accuracy: 0.9998 - precision: 0.9657 - recall: 0.9595 - false_negatives: 63.0000 - false_positives: 53.0000 - true_negatives: 753336.0000 - true_positives: 1492.0000
Epoch 19/25
5898/5898 [==============================] - 1196s 203ms/step - loss: 5.4637e-04 - accuracy: 0.9998 - precision: 0.9637 - recall: 0.9563 - false_negatives: 68.0000 - false_positives: 56.0000 - true_negatives: 753332.0000 - true_positives: 1488.0000
Epoch 20/25
5898/5898 [==============================] - 1748s 296ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1557.0000 - false_positives: 0.0000e+00 - true_negatives: 753387.0000 - true_positives: 0.0000e+00
Epoch 21/25
5898/5898 [==============================] - 1150s 195ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1557.0000 - false_positives: 0.0000e+00 - true_negatives: 753387.0000 - true_positives: 0.0000e+00
Epoch 22/25
5898/5898 [==============================] - 1145s 194ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1558.0000 - false_positives: 0.0000e+00 - true_negatives: 753386.0000 - true_positives: 0.0000e+00
Epoch 23/25
5898/5898 [==============================] - 1145s 194ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1555.0000 - false_positives: 0.0000e+00 - true_negatives: 753389.0000 - true_positives: 0.0000e+00
Epoch 24/25
5898/5898 [==============================] - 1146s 194ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1558.0000 - false_positives: 0.0000e+00 - true_negatives: 753386.0000 - true_positives: 0.0000e+00
Epoch 25/25
5898/5898 [==============================] - 1148s 195ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1557.0000 - false_positives: 0.0000e+00 - true_negatives: 753387.0000 - true_positives: 0.0000e+00
I think this is a gradient exploding.
I've searched the other questions but most of the solutions I already implemented but no luck :
Loss of CNN in Keras becomes nan at some point of training
Why the total loss of a Deep Learning model can abruptly become NaN using Keras (Python)?
I've tried to do regularization (Dropout after every layer, L1, L2) but the reult was poor model (the model didn't learned anything). I've tried adam decay learning rate, batch norm, also didn't solve the problem.

I think this is a gradient exploding.
No. Usually exploding gradients has the loss get worse before returning NaNs.
layers.Dense(units=1, activation="sigmoid")
I thought keras had protections against this, but it's possible the problem here is that the naive loss calculation of sigmoid crossentropy will diverge outside of very limited input range.
Try:
layers.Dense(units=1)
...
model.compile(..., loss=keras.losses.BinaryCrossentropy(from_logits=True))
LMK if that works.

Related

L2 regularizer in tensorflow v2

The following model is defined in TF1, I am trying to migrate it to TF2 without using compat API.
# Define the tensorflow neural network
# 1. Input:
self.input_states = tf.placeholder(
tf.float32, shape=[None, 4, board_height, board_width])
self.input_state = tf.transpose(self.input_states, [0, 2, 3, 1])
# 2. Common Networks Layers
self.conv1 = tf.layers.conv2d(inputs=self.input_state,
filters=32, kernel_size=[3, 3],
padding="same", data_format="channels_last",
activation=tf.nn.relu)
self.conv2 = tf.layers.conv2d(inputs=self.conv1, filters=64,
kernel_size=[3, 3], padding="same",
data_format="channels_last",
activation=tf.nn.relu)
self.conv3 = tf.layers.conv2d(inputs=self.conv2, filters=128,
kernel_size=[3, 3], padding="same",
data_format="channels_last",
activation=tf.nn.relu)
# 3-1 Action Networks
self.action_conv = tf.layers.conv2d(inputs=self.conv3, filters=4,
kernel_size=[1, 1], padding="same",
data_format="channels_last",
activation=tf.nn.relu)
# Flatten the tensor
self.action_conv_flat = tf.reshape(
self.action_conv, [-1, 4 * board_height * board_width])
# 3-2 Full connected layer, the output is the log probability of moves
# on each slot on the board
self.action_fc = tf.layers.dense(inputs=self.action_conv_flat,
units=board_height * board_width,
activation=tf.nn.log_softmax)
# 4 Evaluation Networks
self.evaluation_conv = tf.layers.conv2d(inputs=self.conv3, filters=2,
kernel_size=[1, 1],
padding="same",
data_format="channels_last",
activation=tf.nn.relu)
self.evaluation_conv_flat = tf.reshape(
self.evaluation_conv, [-1, 2 * board_height * board_width])
self.evaluation_fc1 = tf.layers.dense(inputs=self.evaluation_conv_flat,
units=64, activation=tf.nn.relu)
# output the score of evaluation on current state
self.evaluation_fc2 = tf.layers.dense(inputs=self.evaluation_fc1,
units=1, activation=tf.nn.tanh)
# Define the Loss function
# 1. Label: the array containing if the game wins or not for each state
self.labels = tf.placeholder(tf.float32, shape=[None, 1])
# 2. Predictions: the array containing the evaluation score of each state
# which is self.evaluation_fc2
# 3-1. Value Loss function
self.value_loss = tf.losses.mean_squared_error(self.labels,
self.evaluation_fc2)
# 3-2. Policy Loss function
self.mcts_probs = tf.placeholder(
tf.float32, shape=[None, board_height * board_width])
self.policy_loss = tf.negative(tf.reduce_mean(
tf.reduce_sum(tf.multiply(self.mcts_probs, self.action_fc), 1)))
# 3-3. L2 penalty (regularization)
l2_penalty_beta = 1e-4
vars = tf.trainable_variables()
l2_penalty = l2_penalty_beta * tf.add_n(
[tf.nn.l2_loss(v) for v in vars if 'bias' not in v.name.lower()])
# 3-4 Add up to be the Loss function
self.loss = self.value_loss + self.policy_loss + l2_penalty
# Define the optimizer we use for training
self.learning_rate = tf.placeholder(tf.float32)
self.optimizer = tf.train.AdamOptimizer(
learning_rate=self.learning_rate).minimize(self.loss)
And here is my TF2 code
l2_penalty_beta = 1e-4
# Define the tensorflow neural network
# 1. Input:
self.inputs = tf.keras.Input( shape=(4, board_height, board_width), dtype=tf.dtypes.float32)
self.transposed_inputs = tf.keras.layers.Lambda( lambda x: tf.transpose(x, [0, 2, 3, 1]) )(self.inputs)
# 2. Common Networks Layers
self.conv1 = tf.keras.layers.Conv2D( name="conv1",
filters=32,
kernel_size=(3, 3),
padding="same",
data_format="channels_last",
activation=tf.keras.activations.relu,
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.transposed_inputs)
self.conv2 = tf.keras.layers.Conv2D( name="conv2",
filters=64,
kernel_size=(3, 3),
padding="same",
data_format="channels_last",
activation=tf.keras.activations.relu,
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.conv1)
self.conv3 = tf.keras.layers.Conv2D( name="conv3",
filters=128,
kernel_size=(3, 3),
padding="same",
data_format="channels_last",
activation=tf.keras.activations.relu,
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.conv2)
# 3-1 Action Networks
self.action_conv = tf.keras.layers.Conv2D( name="action_conv",
filters=4,
kernel_size=(1, 1),
padding="same",
data_format="channels_last",
activation=tf.keras.activations.relu,
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.conv3)
# flatten tensor
self.action_conv_flat = tf.keras.layers.Reshape( (-1, 4 * board_height * board_width), name="action_conv_flat" )(self.action_conv)
# 3-2 Full connected layer, the output is the log probability of moves
# on each slot on the board
self.action_fc = tf.keras.layers.Dense( board_height * board_width,
activation=tf.nn.log_softmax,
name="action_fc",
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.action_conv_flat)
# 4 Evaluation Networks
self.evaluation_conv = tf.keras.layers.Conv2D( name="evaluation_conv",
filters=2,
kernel_size=(1, 1),
padding="same",
data_format="channels_last",
activation=tf.keras.activations.relu,
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.conv3)
self.evaluation_conv_flat = tf.keras.layers.Reshape( (-1, 2 * board_height * board_width),
name="evaluation_conv_flat" )(self.evaluation_conv)
self.evaluation_fc1 = tf.keras.layers.Dense( 64,
activation=tf.keras.activations.relu,
name="evaluation_fc1",
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.evaluation_conv_flat)
self.evaluation_fc2 = tf.keras.layers.Dense( 1,
activation=tf.keras.activations.tanh,
name="evaluation_fc2",
kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta))(self.evaluation_fc1)
self.outputs = tf.keras.layers.Concatenate()([self.action_fc, self.evaluation_fc2])
self.model = tf.keras.Model(inputs=self.inputs, outputs=self.outputs, name="policy_value_model")
self.model.summary()
def custom_loss(labels, predictions):
expected_act_probs, expected_value = tf.split(labels, [self.board_height*self.board_width, -1], axis=2)
pred_act_probs, pred_value = tf.split(predictions, [self.board_height*self.board_width, -1], axis=2)
value_loss = tf.losses.mean_squared_error(expected_value[0], pred_value[0])
policy_loss = tf.negative(tf.reduce_mean(
tf.reduce_sum(tf.multiply(expected_act_probs, pred_act_probs), 1)))
return value_loss + policy_loss
#print(tf.autograph.to_code(custom_loss))
self.model.compile(optimizer=tf.keras.optimizers.Adam(),
loss=tf.function(custom_loss),
metrics=['accuracy'])
Here is summary of this model.
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 4, 15, 15)] 0 []
lambda (Lambda) (None, 15, 15, 4) 0 ['input_1[0][0]']
conv1 (Conv2D) (None, 15, 15, 32) 1184 ['lambda[0][0]']
conv2 (Conv2D) (None, 15, 15, 64) 18496 ['conv1[0][0]']
conv3 (Conv2D) (None, 15, 15, 128) 73856 ['conv2[0][0]']
evaluation_conv (Conv2D) (None, 15, 15, 2) 258 ['conv3[0][0]']
action_conv (Conv2D) (None, 15, 15, 4) 516 ['conv3[0][0]']
evaluation_conv_flat (Reshape) (None, 1, 450) 0 ['evaluation_conv[0][0]']
action_conv_flat (Reshape) (None, 1, 900) 0 ['action_conv[0][0]']
evaluation_fc1 (Dense) (None, 1, 64) 28864 ['evaluation_conv_flat[0][0]']
action_fc (Dense) (None, 1, 225) 202725 ['action_conv_flat[0][0]']
evaluation_fc2 (Dense) (None, 1, 1) 65 ['evaluation_fc1[0][0]']
concatenate (Concatenate) (None, 1, 226) 0 ['action_fc[0][0]',
'evaluation_fc2[0][0]']
==================================================================================================
As you can notice, the TF1 model outputs two tensors (action_fc and evaluation_fc2). And in TF2 model I added a concatenate layer in the last to combine them into a single tensor so that I can have a single loss function on them.
The loss function in TF1 model sum up three parts. The l2_penalty is the L2 lost of all weights in the model.
l2_penalty_beta = 1e-4
vars = tf.trainable_variables()
l2_penalty = l2_penalty_beta * tf.add_n(
[tf.nn.l2_loss(v) for v in vars if 'bias' not in v.name.lower()])
self.loss = self.value_loss + self.policy_loss + l2_penalty
In TF2 model, all trainable layers are added kernel_regularizer=tf.keras.regularizers.L2(l2_penalty_beta). Is that the same as TF1 model? Or do I make some mistake?
The TF2 model will be executed in an environment without Python interpreter, that is to say the model will be compiled into graph. I guess my loss function has to be stateless -- it must not rely on any varialble out of the function scope.
UPDATE:
Here is the training logs from the fress TF2 model. The loss looks abnormal. It is too small at the start. The loss is greater than 4.0 in TF1 model. Will the L2 regularizer be part of loss in TF2?
batch i:2, episode_len:113
1/1 - 0s - loss: 0.7342 - accuracy: 0.0000e+00 - 18ms/epoch - 18ms/step
1/1 - 0s - loss: 0.4714 - accuracy: 0.0000e+00 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.1785 - accuracy: 0.0000e+00 - 14ms/epoch - 14ms/step
1/1 - 0s - loss: 0.0829 - accuracy: 0.0000e+00 - 30ms/epoch - 30ms/step
1/1 - 0s - loss: 0.0743 - accuracy: 0.0000e+00 - 13ms/epoch - 13ms/step
kl:0.00178,lr_multiplier:2.250,loss:[0.07430928945541382]
batch i:3, episode_len:92
1/1 - 0s - loss: 0.0764 - accuracy: 1.0000 - 19ms/epoch - 19ms/step
1/1 - 0s - loss: 0.0718 - accuracy: 1.0000 - 28ms/epoch - 28ms/step
1/1 - 0s - loss: 0.0705 - accuracy: 1.0000 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.0693 - accuracy: 1.0000 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.0681 - accuracy: 1.0000 - 34ms/epoch - 34ms/step
kl:0.00068,lr_multiplier:3.375,loss:[0.06813239306211472]
batch i:4, episode_len:118
1/1 - 0s - loss: 0.0676 - accuracy: 1.0000 - 16ms/epoch - 16ms/step
1/1 - 0s - loss: 0.0665 - accuracy: 1.0000 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.0654 - accuracy: 1.0000 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.0643 - accuracy: 1.0000 - 15ms/epoch - 15ms/step
1/1 - 0s - loss: 0.0631 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
kl:0.00425,lr_multiplier:5.062,loss:[0.06307009607553482]
batch i:5, episode_len:84
1/1 - 0s - loss: 4.0628 - accuracy: 0.0000e+00 - 17ms/epoch - 17ms/step
1/1 - 0s - loss: 4.0618 - accuracy: 0.0000e+00 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 4.0606 - accuracy: 0.0000e+00 - 16ms/epoch - 16ms/step
1/1 - 0s - loss: 4.0592 - accuracy: 0.0000e+00 - 11ms/epoch - 11ms/step
1/1 - 0s - loss: 4.0577 - accuracy: 0.0000e+00 - 13ms/epoch - 13ms/step
kl:0.07804,lr_multiplier:3.375,loss:[4.057666778564453]
batch i:6, episode_len:96
1/1 - 0s - loss: 0.0599 - accuracy: 1.0000 - 17ms/epoch - 17ms/step
1/1 - 0s - loss: 0.0589 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
1/1 - 0s - loss: 0.0579 - accuracy: 1.0000 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.0568 - accuracy: 1.0000 - 31ms/epoch - 31ms/step
1/1 - 0s - loss: 0.0556 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
kl:0.06898,lr_multiplier:2.250,loss:[0.055556993931531906]
batch i:7, episode_len:62
1/1 - 0s - loss: 0.0577 - accuracy: 1.0000 - 28ms/epoch - 28ms/step
1/1 - 0s - loss: 0.0569 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
1/1 - 0s - loss: 0.0559 - accuracy: 1.0000 - 14ms/epoch - 14ms/step
1/1 - 0s - loss: 0.0549 - accuracy: 1.0000 - 16ms/epoch - 16ms/step
1/1 - 0s - loss: 0.0538 - accuracy: 1.0000 - 19ms/epoch - 19ms/step
kl:0.03346,lr_multiplier:2.250,loss:[0.05379907414317131]
batch i:8, episode_len:118
1/1 - 0s - loss: 0.0552 - accuracy: 1.0000 - 28ms/epoch - 28ms/step
1/1 - 0s - loss: 0.0543 - accuracy: 1.0000 - 48ms/epoch - 48ms/step
1/1 - 0s - loss: 0.0532 - accuracy: 1.0000 - 12ms/epoch - 12ms/step
1/1 - 0s - loss: 0.0521 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
1/1 - 0s - loss: 0.0510 - accuracy: 1.0000 - 11ms/epoch - 11ms/step
kl:0.04336,lr_multiplier:1.500,loss:[0.051018256694078445]
Yes, I think that's the right way to add L2 regularization to the weights (but not the biases) in TF2
My only thought is that you don't have to concatenate the outputs tensors in TF2, you can define
self.outputs = [self.action_fc, self.evaluation_fc2]
if that's easier. (Then also specify your losses as a length 2 list etc.).

Loss not changing and accuracy remains 0 after calling fit()

I'm new to keras and tensorflow, I have a model that I am trying to train where the loss does not change after epoch #1.
my data is the sequence of numbers which I want NN to learn and predict the next number:
data[10:15]
Out[3]:
array([[30, 36, 28, 25, 30, 35],
[36, 28, 25, 30, 35, 28],
[28, 25, 30, 35, 28, 29],
[25, 30, 35, 28, 29, 25],
[30, 35, 28, 29, 25, 38]])
For example I want [30, 36, 28, 25, 30] to be my input and 35 to be my output.
and this is my very simple code and NN:
[I normalized all my data using StandardScaler() but it didn't change.]
data = gen_train_data()
scaler = StandardScaler()
data = scaler.fit_transform(data)
X_train_full, X_test, y_train_full, y_test = train_test_split(data[:, 0:5], data[:, 5])
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full)
model = keras.models.Sequential()
model.add(keras.layers.Dense(5))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(30, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(1, activation="softmax"))
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="mse", optimizer=optimizer, metrics=["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train,
epochs=16, validation_data=(X_valid, y_valid),
callbacks=[early_stopping_cb])
mse_test = model.evaluate(X_test, y_test)
and this is my console after running code above:
Epoch 1/16
1/26 [>.............................] - ETA: 57s - loss: 3.2932 - accuracy: 0.0000e+00
26/26 [==============================] - 3s 23ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 2/16
26/26 [==============================] - 0s 4ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 3/16
26/26 [==============================] - 0s 4ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 4/16
26/26 [==============================] - 0s 5ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 5/16
26/26 [==============================] - 0s 5ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 6/16
26/26 [==============================] - 0s 5ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 7/16
26/26 [==============================] - 0s 4ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 8/16
26/26 [==============================] - 0s 5ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 9/16
26/26 [==============================] - 0s 5ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 10/16
26/26 [==============================] - 0s 5ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
Epoch 11/16
26/26 [==============================] - 0s 6ms/step - loss: 2.0375 - accuracy: 0.0000e+00 - val_loss: 2.2258 - val_accuracy: 0.0000e+00
12/12 [==============================] - 0s 3ms/step - loss: 1.7458 - accuracy: 0.0000e+00
and the prediction is 1 for all inputs
model.predict(X_test[:5])
Out[2]:
array([[1.],
[1.],
[1.],
[1.],
[1.]], dtype=float32)
I tried everything (activation functions, learning rates, more/less hidden layers, ...), nothing changes the output.
I would really appreciate it if someone can help me

Not stable training in CNN

I am currently doing a image classfication which takes up to 81000 data to train the model(CNN). It has a very not stable val_training accuracy and val_accuracy between each epochs. I have done data augmentation for the training as well. Here is the training result for loss and accuracy. It goes all the way to 100 epochs with the same flow like this. Should I change my regularization method or the architecture of my network?
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_1 (Conv2D) (None, 128, 128, 32) 896
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 32) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 64, 64, 32) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 64, 64, 64) 18496
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 32, 32, 64) 0
_________________________________________________________________
dropout_2 (Dropout) (None, 32, 32, 64) 0
_________________________________________________________________
conv2d_3 (Conv2D) (None, 32, 32, 64) 36928
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 16, 16, 64) 0
_________________________________________________________________
dropout_3 (Dropout) (None, 16, 16, 64) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, 16, 16, 128) 73856
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 8, 8, 128) 0
_________________________________________________________________
dropout_4 (Dropout) (None, 8, 8, 128) 0
_________________________________________________________________
flatten_1 (Flatten) (None, 8192) 0
_________________________________________________________________
dense_1 (Dense) (None, 512) 4194816
_________________________________________________________________
dropout_5 (Dropout) (None, 512) 0
_________________________________________________________________
dense_2 (Dense) (None, 162) 83106
=================================================================
Total params: 4,408,098
Trainable params: 4,408,098
Non-trainable params: 0
Epoch 1/100
2430/2430 [==============================] - 2817s 1s/step - loss: 4.0652 - accuracy: 0.1137 - val_loss: 2.5681 - val_accuracy: 0.3385
Epoch 2/100
2430/2430 [==============================] - 2263s 931ms/step - loss: 2.1216 - accuracy: 0.4463 - val_loss: 2.1476 - val_accuracy: 0.5516
Epoch 3/100
2430/2430 [==============================] - 2250s 926ms/step - loss: 1.4840 - accuracy: 0.5960 - val_loss: 1.4907 - val_accuracy: 0.5631
Epoch 4/100
2430/2430 [==============================] - 2705s 1s/step - loss: 1.1820 - accuracy: 0.6690 - val_loss: 1.0003 - val_accuracy: 0.6717
Epoch 5/100
2430/2430 [==============================] - 2470s 1s/step - loss: 0.9978 - accuracy: 0.7211 - val_loss: 0.7172 - val_accuracy: 0.7038
Epoch 6/100
2430/2430 [==============================] - 2850s 1s/step - loss: 0.8731 - accuracy: 0.7522 - val_loss: 0.7637 - val_accuracy: 0.7460
Epoch 7/100
2430/2430 [==============================] - 2819s 1s/step - loss: 0.7883 - accuracy: 0.7748 - val_loss: 0.7909 - val_accuracy: 0.7278
Epoch 8/100
2430/2430 [==============================] - 2725s 1s/step - loss: 0.7235 - accuracy: 0.7939 - val_loss: 0.7154 - val_accuracy: 0.7369
Epoch 9/100
2430/2430 [==============================] - 2642s 1s/step - loss: 0.6703 - accuracy: 0.8062 - val_loss: 0.6727 - val_accuracy: 0.7158
Epoch 10/100
2430/2430 [==============================] - 2673s 1s/step - loss: 0.6331 - accuracy: 0.8163 - val_loss: 0.9074 - val_accuracy: 0.7794
Epoch 11/100
2430/2430 [==============================] - 2517s 1s/step - loss: 0.5998 - accuracy: 0.8283 - val_loss: 0.3628 - val_accuracy: 0.8017
Epoch 12/100
2430/2430 [==============================] - 2537s 1s/step - loss: 0.5726 - accuracy: 0.8366 - val_loss: 0.3375 - val_accuracy: 0.7677
Epoch 13/100
2430/2430 [==============================] - 2788s 1s/step - loss: 0.5540 - accuracy: 0.8380 - val_loss: 0.9867 - val_accuracy: 0.7475
Epoch 14/100
2430/2430 [==============================] - 2575s 1s/step - loss: 0.5289 - accuracy: 0.8467 - val_loss: 1.2910 - val_accuracy: 0.7871
Epoch 15/100
2430/2430 [==============================] - 2720s 1s/step - loss: 0.5085 - accuracy: 0.8522 - val_loss: 0.4738 - val_accuracy: 0.8069
Epoch 16/100
2430/2430 [==============================] - 2880s 1s/step - loss: 0.4929 - accuracy: 0.8563 - val_loss: 0.3417 - val_accuracy: 0.8237
Epoch 17/100
2430/2430 [==============================] - 2587s 1s/step - loss: 0.4900 - accuracy: 0.8571 - val_loss: 0.3708 - val_accuracy: 0.8212
Epoch 18/100
2430/2430 [==============================] - 2603s 1s/step - loss: 0.4826 - accuracy: 0.8600 - val_loss: 0.9994 - val_accuracy: 0.7801
Epoch 19/100
2430/2430 [==============================] - 2792s 1s/step - loss: 0.4728 - accuracy: 0.8630 - val_loss: 0.4388 - val_accuracy: 0.8108
Epoch 20/100
2430/2430 [==============================] - 2450s 1s/step - loss: 0.4510 - accuracy: 0.8700 - val_loss: 0.6080 - val_accuracy: 0.7988
Epoch 21/100
2430/2430 [==============================] - 2516s 1s/step - loss: 0.4571 - accuracy: 0.8666 - val_loss: 0.4918 - val_accuracy: 0.7780
It looks to me like a pretty normal training of a CNN. Your model is basically reaching what is known as a local minimum!
This means it is basically bouncing back and forth over optimal model parameters without ever getting it exactly right. This is because your learning rate might be too high.
If you plan on training for a long long time you can simply lower the learning rate and increase the epochs, or even better you can use a custom learning rate scheduler.
Specifically exponential decay could work well.
If you are using tensorflow/keras, take a look here --- Exponential Decay Optimizer
You pass can define this when you compile your model like this
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])

Tensorflow Model I/O question: Failed to find data adapter that can handle input

This is from the Keras document example: Train a model to calculate the priority_score and which department to forward for an email.
I implement the model in another way, I can compile it but I cannot train the model. I guess it a model I/O issue, i.e I need to feed the correct format of the I/O data.
ValueError: Failed to find data adapter that can handle input: (<class 'dict'> containing {"<class 'str'>"} keys and {"<class 'numpy.ndarray'>", '(<class \'list\'> containing values of types {"<class \'str\'>"})'} values), (<class 'dict'> containing {"<class 'str'>"} keys and {"<class 'numpy.ndarray'>"} values)
Its too long so I didn't put it into this post's title.
My model has 3 inputs:
title_input: It supposed to be a single string
body_input: It supposed to be a single string
tags_input: An array of 12 0s or 1s. For example, [0,1,0,1,0,0,0,0,0,1]
The output are:
priority: A float
departments: An array of 4 0,1s.
Questions
Can anyone tell me what's wrong in my code?
And generally speaking, how should I think about the I/O of a model? Such as this case. I thought preparing N strings, such as 800 strings, and 800 tags is OK. But I keep getting errors. Well I solved most of them but couldn't overcome this one. Please share your experiences. Thanks!
Appendix
Model summary
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
tags_input (InputLayer) [(None, 12)] 0
__________________________________________________________________________________________________
flatten (Flatten) (None, 12) 0 tags_input[0][0]
__________________________________________________________________________________________________
title_input (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
body_input (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
dense (Dense) (None, 500) 6500 flatten[0][0]
__________________________________________________________________________________________________
text_vectorization (TextVectori (None, 500) 0 title_input[0][0]
__________________________________________________________________________________________________
text_vectorization_1 (TextVecto (None, 500) 0 body_input[0][0]
__________________________________________________________________________________________________
tf_op_layer_ExpandDims (TensorF [(None, 500, 1)] 0 dense[0][0]
__________________________________________________________________________________________________
embedding (Embedding) (None, 500, 100) 1000100 text_vectorization[0][0]
__________________________________________________________________________________________________
embedding_1 (Embedding) (None, 500, 100) 1000100 text_vectorization_1[0][0]
__________________________________________________________________________________________________
dense_1 (Dense) (None, 500, 100) 200 tf_op_layer_ExpandDims[0][0]
__________________________________________________________________________________________________
concatenate (Concatenate) (None, 500, 300) 0 embedding[0][0]
embedding_1[0][0]
dense_1[0][0]
__________________________________________________________________________________________________
priority (Dense) (None, 500, 1) 301 concatenate[0][0]
__________________________________________________________________________________________________
departments (Dense) (None, 500, 4) 1204 concatenate[0][0]
==================================================================================================
Total params: 2,008,405
Trainable params: 2,008,405
Non-trainable params: 0
__________________________________________________________________________________________________
Full Code
def MultiInputAndOutpt():
max_features = 10000
sequnce_length = 500
embedding_dims = 100
num_departments = 4
num_tags = 12
str = "hello"
title_vect = TextVectorization(max_tokens=max_features, output_mode="int", output_sequence_length=sequnce_length)
body_vect = TextVectorization(max_tokens=max_features, output_mode="int", output_sequence_length=sequnce_length)
title_input = keras.Input(shape=(1,), dtype=tf.string, name="title_input")
x1 = title_vect(title_input)
x1 = layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dims)(x1)
body_input = keras.Input(shape=(1,), dtype=tf.string, name="body_input")
x2 = body_vect(body_input)
x2 = layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dims)(x2)
tags_input = keras.Input(shape=(num_tags,), name="tags_input")
x3 = layers.Flatten()(tags_input)
x3 = layers.Dense(500)(x3)
x3 = tf.expand_dims(x3, axis=-1)
x3 = layers.Dense(100)(x3)
x = layers.concatenate([x1, x2, x3])
priority_score = layers.Dense(1)(x)
priority_score = tf.reshape(priority_score, (-1, 1), name="priority")
departments = layers.Dense(num_departments)(x)
departments = tf.reshape(departments, (-1, num_departments), name="departments")
model = keras.Model(inputs=[title_input, body_input, tags_input], outputs=[priority_score, departments])
model.summary()
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.1),
loss=[keras.losses.BinaryCrossentropy(from_logits=True),
keras.losses.CategoricalCrossentropy(from_logits=True)],
loss_weights=[1.0, 0.2],
)
# title_data = np.random.randint(num_words, size=(1280, 10))
# body_data = np.random.randint(num_words, size=(1280, 100))
alphabet = np.array(list(string.ascii_lowercase + ' '))
title_data = np.random.choice(alphabet, size=(800, 1000))
body_data = np.random.choice(alphabet, size=(800, 1000))
tags_data = np.random.randint(2, size=(800, num_tags)).astype("float32")
body_data = ["".join(body_data[i]) for i in range(len(body_data))]
title_data = ["".join(title_data[i]) for i in range(len(title_data))]
# Dummy target data
priority_targets = np.random.random(size=(800, 1))
dept_targets = np.random.randint(2, size=(800, num_departments))
model.fit(
{"title_input": title_data, "body_input": body_data, "input3": tags_data},
{"priority": priority_targets, "departments": dept_targets},
epochs=2,
batch_size=32, )
I figured out by myself:
Input
The inputs of the model are correct. I don't need the flatten though.
tags_input (InputLayer) [(None, 12)] 0
__________________________________________________________________________________________________
title_input (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
body_input (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
output
The outputs are not correct, I don't want (None, 500, 1) and (None, 500, 4) as the output. I only need 1 priority score and the 1 department list of 4 values.
To change the shape from (None, 500, 1) to (None, 1) I need to drop some values. There are many ways do this, here I chose to drop the middle dim directly.
...
departments = layers.Dense(num_departments)(x) # Shape: (None, 500, 4)
departments = tf.slice(departments, [0, 0, 0], [-1, 1, 4]) # Shape (None, 1, 4)
departments = tf.squeeze(departments, [1]) # Shape (None, 4) but its not a squeeze type
departments = layers.Dense(num_departments, name="departments")(departments) # Shape (None, 4)
...
Same to the priority_score output.
And now the outputs become
priority_score (Dense) (None, 1) 2 tf.compat.v1.squeeze[0][0]
__________________________________________________________________________________________________
departments (Dense) (None, 4) 20 tf.compat.v1.squeeze_1[0][0]
Train model
The next step is to prepare the training data. What we need is to construct
title data: N strings, shape (N, 1). here 1 represent a python string.
body data: Same as tiele data
tags data: N array of floats, shape (N, 12)
Targets:
priority_score: N floats, shape (N, 1)
department: N array of floats, shape (N, 4)
where N can be any number.
Then we call the fit function:
model.fit(
{"title_input": title_data, "body_input": body_data, "tags_input": tags_data},
{"priority_score": priority_targets, "departments": dept_targets, },
epochs=50,
batch_size=64, )
Surprisingly the loss keeps growing:
Epoch 1/50
157/157 [==============================] - 5s 28ms/step - loss: 1.3467 - priority_score_loss: 0.6938 - departments_loss: 3.2644 - priority_score_acc: 0.0000e+00 - departments_acc: 0.1267
Epoch 2/50
157/157 [==============================] - 4s 27ms/step - loss: 4.6381 - priority_score_loss: 0.6976 - departments_loss: 19.7023 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2483
Epoch 3/50
157/157 [==============================] - 4s 28ms/step - loss: 16.9411 - priority_score_loss: 0.6984 - departments_loss: 81.2137 - priority_score_acc: 0.0000e+00 - departments_acc: 0.1569
Epoch 4/50
157/157 [==============================] - 5s 29ms/step - loss: 23.8020 - priority_score_loss: 0.7075 - departments_loss: 115.4721 - priority_score_acc: 0.0000e+00 - departments_acc: 0.1427
Epoch 5/50
157/157 [==============================] - 5s 29ms/step - loss: 1.8650 - priority_score_loss: 0.7046 - departments_loss: 5.8019 - priority_score_acc: 0.0000e+00 - departments_acc: 0.1995
Epoch 6/50
157/157 [==============================] - 5s 30ms/step - loss: 3.0613 - priority_score_loss: 0.7025 - departments_loss: 11.7943 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2472
Epoch 7/50
157/157 [==============================] - 5s 30ms/step - loss: 5.2455 - priority_score_loss: 0.7032 - departments_loss: 22.7114 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2402
Epoch 8/50
157/157 [==============================] - 5s 30ms/step - loss: 6.0378 - priority_score_loss: 0.7013 - departments_loss: 26.6828 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2418
Epoch 9/50
157/157 [==============================] - 5s 30ms/step - loss: 10.8300 - priority_score_loss: 0.7033 - departments_loss: 50.6334 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2465
Epoch 10/50
157/157 [==============================] - 4s 27ms/step - loss: 12.1005 - priority_score_loss: 0.7019 - departments_loss: 56.9929 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2627
Epoch 11/50
157/157 [==============================] - 4s 27ms/step - loss: 15.8248 - priority_score_loss: 0.6983 - departments_loss: 75.6328 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2513
Epoch 12/50
157/157 [==============================] - 5s 29ms/step - loss: 19.3059 - priority_score_loss: 0.6940 - departments_loss: 93.0596 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2386
Epoch 13/50
157/157 [==============================] - 5s 29ms/step - loss: 32.6499 - priority_score_loss: 0.6937 - departments_loss: 159.7808 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2526
Epoch 14/50
157/157 [==============================] - 4s 28ms/step - loss: 31.1433 - priority_score_loss: 0.6936 - departments_loss: 152.2486 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2499
Epoch 15/50
157/157 [==============================] - 5s 29ms/step - loss: 41.9199 - priority_score_loss: 0.6932 - departments_loss: 206.1338 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2362
Epoch 16/50
157/157 [==============================] - 5s 30ms/step - loss: 40.2069 - priority_score_loss: 0.6931 - departments_loss: 197.5692 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2300
Epoch 17/50
157/157 [==============================] - 5s 30ms/step - loss: 60.4129 - priority_score_loss: 0.6932 - departments_loss: 298.5986 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2425
Epoch 18/50
157/157 [==============================] - 5s 30ms/step - loss: 75.8330 - priority_score_loss: 0.6932 - departments_loss: 375.6990 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2332
Epoch 19/50
157/157 [==============================] - 5s 29ms/step - loss: 81.5731 - priority_score_loss: 0.6931 - departments_loss: 404.4002 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2568
Epoch 20/50
157/157 [==============================] - 4s 28ms/step - loss: 103.4053 - priority_score_loss: 0.6932 - departments_loss: 513.5608 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2409
Epoch 21/50
157/157 [==============================] - 4s 28ms/step - loss: 106.4842 - priority_score_loss: 0.6932 - departments_loss: 528.9552 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2584
Epoch 22/50
157/157 [==============================] - 4s 28ms/step - loss: 121.2103 - priority_score_loss: 0.6932 - departments_loss: 602.5854 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2332
Epoch 23/50
157/157 [==============================] - 5s 29ms/step - loss: 139.4970 - priority_score_loss: 0.6932 - departments_loss: 694.0189 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2421
Epoch 24/50
157/157 [==============================] - 5s 29ms/step - loss: 180.7346 - priority_score_loss: 0.6933 - departments_loss: 900.2067 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2449
Epoch 25/50
157/157 [==============================] - 4s 28ms/step - loss: 201.8011 - priority_score_loss: 0.6932 - departments_loss: 1005.5396 - priority_score_acc: 0.0000e+00 - departments_acc: 0.2420
Epoch 26/50
I guess this is because the training data is randomly generated, and the model is not well constructed. Anyway, we can train model and predict with some data now.
This was good learning experience.

input_shape with image_generator in Tensorflow

I'm trying to use this approach in Tensorflow 2.X to load large dataset that does not fit in memory.
I have a folder with X sub-folders that contains images. Each sub-folder is a class.
\dataset
-\class1
-img1_1.jpg
-img1_2.jpg
-...
-\classe2
-img2_1.jpg
-img2_2.jpg
-...
I create my data generator from my folder like this:
train_data_gen = image_generator.flow_from_directory(directory="path\\to\\dataset",
batch_size=100,
shuffle=True,
target_size=(100, 100), # Image H x W
classes=list(CLASS_NAMES)) # list of folder/class names ["class1", "class2", ...., "classX"]
Found 629 images belonging to 2 classes.
I've did a smaller dataset to test the pipeline. Only 629 images in 2 classes.
Now I can create a dummy model like this:
model = tf.keras.Sequential()
model.add(Dense(1, activation=activation, input_shape=(100, 100, 3))) # only 1 layer of 1 neuron
model.add(Dense(2)) # 2classes
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['categorical_accuracy'])
Once compile I try to fit this dummy model:
STEPS_PER_EPOCH = np.ceil(image_count / batch_size) # 629 / 100
model.fit_generator(generator=train_data_gen , steps_per_epoch=STEPS_PER_EPOCH, epochs=2, verbose=1)
1/7 [===>..........................] - ETA: 2s - loss: 1.1921e-07 - categorical_accuracy: 0.9948
2/7 [=======>......................] - ETA: 1s - loss: 1.1921e-07 - categorical_accuracy: 0.5124
3/7 [===========>..................] - ETA: 0s - loss: 1.1921e-07 - categorical_accuracy: 0.3449
4/7 [================>.............] - ETA: 0s - loss: 1.1921e-07 - categorical_accuracy: 0.2662
5/7 [====================>.........] - ETA: 0s - loss: 1.1921e-07 - categorical_accuracy: 0.2130
6/7 [========================>.....] - ETA: 0s - loss: 1.1921e-07 - categorical_accuracy: 0.1808
2020-04-14 20:39:48.629203: W tensorflow/core/framework/op_kernel.cc:1610] Invalid argument: ValueError: generator yielded an element of shape (29, 100, 100, 3) where an element of shape (100, 100, 100, 3) was expected.
From what i understand, the last batch doesn't has the same shape has the previous batches. So it crashes. I've tried to specify a batch_input_shape.
model.add(Dense(1, activation=activation, batch_input_shape=(None, 100, 100, 3)))
I've found here that I should put None to not specify the number of elements in the batch so it can be dynamic. But no success.
Edit: From the comment I had 2 mistakes:
The output shape was bad. I missed the flatten layer in the model.
The previous link does work with the correction of the flatten layer
Missing some code, I actually feed the fit_generator with a tf.data.Dataset.from_generator but I gave here a image_generator.flow_from_directory.
Here is the final code:
train_data_gen = image_generator.flow_from_directory(directory="path\\to\\dataset",
batch_size=1000,
shuffle=True,
target_size=(100, 100),
classes=list(CLASS_NAMES))
train_dataset = tf.data.Dataset.from_generator(
lambda: train_data_gen,
output_types=(tf.float32, tf.float32),
output_shapes=([None, x, y, 3],
[None, len(CLASS_NAMES)]))
model = tf.keras.Sequential()
model.add(Flatten(batch_input_shape=(None, 100, 100, 3)))
model.add(Dense(1, activation=activation))
model.add(Dense(2))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['categorical_accuracy'])
STEPS_PER_EPOCH = np.ceil(image_count / batch_size) # 629 / 100
model.fit_generator(generator=train_data_gen , steps_per_epoch=STEPS_PER_EPOCH, epochs=2, verbose=1)
For the benefit of community here i am explaining, how to use image_generator in Tensorflow with input_shape (100, 100, 3) using dogs vs cats dataset
If we haven't choose right batch size there is a chance of model struck right after first epoch, hence i am starting my explanation with how to choose batch_size ?
We generally observe that batch size to be the power of 2, this is because of the effective work of optimized matrix operation libraries. This is further elaborated in this research paper.
Check out this blog which describes how to choose the right batch size while comparing the effects of different batch sizes on the accuracy of CIFAR-10 dataset.
Here is the end to end working code with outputs
import os
import numpy as np
from keras import layers
import pandas as pd
from tensorflow.keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from tensorflow.keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import keras.backend as K
K.set_image_data_format('channels_last')
train_dir = '/content/drive/My Drive/Dogs_Vs_Cats/train'
test_dir = '/content/drive/My Drive/Dogs_Vs_Cats/test'
img_width, img_height = 100, 100
input_shape = img_width, img_height, 3
train_samples = 2000
test_samples = 1000
epochs = 30
batch_size = 32
train_datagen = ImageDataGenerator(
rescale = 1. /255,
shear_range = 0.2,
zoom_range = 0.2,
horizontal_flip = True)
test_datagen = ImageDataGenerator(
rescale = 1. /255)
train_data = train_datagen.flow_from_directory(
train_dir,
target_size = (img_width, img_height),
batch_size = batch_size,
class_mode = 'binary')
test_data = test_datagen.flow_from_directory(
test_dir,
target_size = (img_width, img_height),
batch_size = batch_size,
class_mode = 'binary')
model = Sequential()
model.add(Conv2D(32, (7, 7), strides = (1, 1), input_shape = input_shape))
model.add(BatchNormalization(axis = 3))
model.add(Activation('relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (7, 7), strides = (1, 1)))
model.add(BatchNormalization(axis = 3))
model.add(Activation('relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss = 'binary_crossentropy',
optimizer = 'rmsprop',
metrics = ['accuracy'])
model.fit_generator(
train_data,
steps_per_epoch = train_samples//batch_size,
epochs = epochs,
validation_data = test_data,
verbose = 1,
validation_steps = test_samples//batch_size)
Output:
Found 2000 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.
Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_8 (Conv2D) (None, 94, 94, 32) 4736
_________________________________________________________________
batch_normalization_8 (Batch (None, 94, 94, 32) 128
_________________________________________________________________
activation_8 (Activation) (None, 94, 94, 32) 0
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 47, 47, 32) 0
_________________________________________________________________
conv2d_9 (Conv2D) (None, 41, 41, 64) 100416
_________________________________________________________________
batch_normalization_9 (Batch (None, 41, 41, 64) 256
_________________________________________________________________
activation_9 (Activation) (None, 41, 41, 64) 0
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 20, 20, 64) 0
_________________________________________________________________
flatten_4 (Flatten) (None, 25600) 0
_________________________________________________________________
dense_11 (Dense) (None, 64) 1638464
_________________________________________________________________
dropout_4 (Dropout) (None, 64) 0
_________________________________________________________________
dense_12 (Dense) (None, 1) 65
=================================================================
Total params: 1,744,065
Trainable params: 1,743,873
Non-trainable params: 192
_________________________________________________________________
Epoch 1/30
62/62 [==============================] - 14s 225ms/step - loss: 1.8307 - accuracy: 0.4853 - val_loss: 0.6931 - val_accuracy: 0.5000
Epoch 2/30
62/62 [==============================] - 14s 226ms/step - loss: 0.7085 - accuracy: 0.4832 - val_loss: 0.6931 - val_accuracy: 0.5010
Epoch 3/30
62/62 [==============================] - 14s 218ms/step - loss: 0.6955 - accuracy: 0.5300 - val_loss: 0.6894 - val_accuracy: 0.5292
Epoch 4/30
62/62 [==============================] - 14s 221ms/step - loss: 0.6938 - accuracy: 0.5407 - val_loss: 0.7309 - val_accuracy: 0.5262
Epoch 5/30
62/62 [==============================] - 14s 218ms/step - loss: 0.6860 - accuracy: 0.5498 - val_loss: 0.6776 - val_accuracy: 0.5665
Epoch 6/30
62/62 [==============================] - 13s 216ms/step - loss: 0.7027 - accuracy: 0.5407 - val_loss: 0.6895 - val_accuracy: 0.5101
Epoch 7/30
62/62 [==============================] - 13s 216ms/step - loss: 0.6852 - accuracy: 0.5528 - val_loss: 0.6567 - val_accuracy: 0.5887
Epoch 8/30
62/62 [==============================] - 13s 217ms/step - loss: 0.6772 - accuracy: 0.5427 - val_loss: 0.6643 - val_accuracy: 0.5847
Epoch 9/30
62/62 [==============================] - 13s 217ms/step - loss: 0.6709 - accuracy: 0.5534 - val_loss: 0.6623 - val_accuracy: 0.5887
Epoch 10/30
62/62 [==============================] - 14s 219ms/step - loss: 0.6579 - accuracy: 0.5711 - val_loss: 0.6614 - val_accuracy: 0.6058
Epoch 11/30
62/62 [==============================] - 13s 218ms/step - loss: 0.6591 - accuracy: 0.5625 - val_loss: 0.6594 - val_accuracy: 0.5454
Epoch 12/30
62/62 [==============================] - 13s 216ms/step - loss: 0.6419 - accuracy: 0.5767 - val_loss: 1.1041 - val_accuracy: 0.5161
Epoch 13/30
62/62 [==============================] - 13s 215ms/step - loss: 0.6479 - accuracy: 0.5783 - val_loss: 0.6441 - val_accuracy: 0.5837
Epoch 14/30
62/62 [==============================] - 13s 216ms/step - loss: 0.6373 - accuracy: 0.5899 - val_loss: 0.6427 - val_accuracy: 0.6310
Epoch 15/30
62/62 [==============================] - 13s 215ms/step - loss: 0.6203 - accuracy: 0.6133 - val_loss: 0.7390 - val_accuracy: 0.6220
Epoch 16/30
62/62 [==============================] - 13s 217ms/step - loss: 0.6277 - accuracy: 0.6362 - val_loss: 0.6649 - val_accuracy: 0.5786
Epoch 17/30
62/62 [==============================] - 13s 215ms/step - loss: 0.6155 - accuracy: 0.6316 - val_loss: 0.9823 - val_accuracy: 0.5484
Epoch 18/30
62/62 [==============================] - 14s 222ms/step - loss: 0.6056 - accuracy: 0.6408 - val_loss: 0.6333 - val_accuracy: 0.6048
Epoch 19/30
62/62 [==============================] - 14s 218ms/step - loss: 0.6025 - accuracy: 0.6529 - val_loss: 0.6514 - val_accuracy: 0.6442
Epoch 20/30
62/62 [==============================] - 13s 215ms/step - loss: 0.6149 - accuracy: 0.6423 - val_loss: 0.6373 - val_accuracy: 0.6048
Epoch 21/30
62/62 [==============================] - 13s 215ms/step - loss: 0.6030 - accuracy: 0.6519 - val_loss: 0.6086 - val_accuracy: 0.6573
Epoch 22/30
62/62 [==============================] - 13s 217ms/step - loss: 0.5936 - accuracy: 0.6865 - val_loss: 1.0677 - val_accuracy: 0.5605
Epoch 23/30
62/62 [==============================] - 13s 214ms/step - loss: 0.5964 - accuracy: 0.6728 - val_loss: 0.7927 - val_accuracy: 0.5877
Epoch 24/30
62/62 [==============================] - 13s 215ms/step - loss: 0.5866 - accuracy: 0.6707 - val_loss: 0.6116 - val_accuracy: 0.6421
Epoch 25/30
62/62 [==============================] - 13s 214ms/step - loss: 0.5933 - accuracy: 0.6662 - val_loss: 0.8282 - val_accuracy: 0.6048
Epoch 26/30
62/62 [==============================] - 13s 214ms/step - loss: 0.5705 - accuracy: 0.6885 - val_loss: 0.5806 - val_accuracy: 0.6966
Epoch 27/30
62/62 [==============================] - 14s 218ms/step - loss: 0.5709 - accuracy: 0.7017 - val_loss: 1.2404 - val_accuracy: 0.5333
Epoch 28/30
62/62 [==============================] - 13s 216ms/step - loss: 0.5691 - accuracy: 0.7104 - val_loss: 0.6136 - val_accuracy: 0.6442
Epoch 29/30
62/62 [==============================] - 13s 215ms/step - loss: 0.5627 - accuracy: 0.7048 - val_loss: 0.6936 - val_accuracy: 0.6613
Epoch 30/30
62/62 [==============================] - 13s 214ms/step - loss: 0.5714 - accuracy: 0.6941 - val_loss: 0.5872 - val_accuracy: 0.6825