Tensorflow ModelCheckpoint not saving model, no loss after reloading - tensorflow

The callback is saving checkpoint files, but not the SavedModel model.pb file. Additionally, when I load the model from the checkpoints it does not reload 'val_loss' which I'm conditioning "save_best_model" on.
I tried using a model.save() only on the best iteration but was having trouble with getting that to work correctly and it would be more convenient to use the ModelCheckpoint callback.
Here is the relevant code
# One shared loss object for all three outputs.
# (The original line ended in a trailing comma, which made LOSS a 1-tuple.)
LOSS = tf.keras.losses.MeanSquaredError()
# multi output: 3 categories, each from 0 to 1
model = ImgToClassSimpleContinuous(img_height, img_width)
checkpoint_filename = "../chkpts/ImgToClassSimpleContinuous/checkpoint_dir"
# Start from the weights stored at the checkpoint path.
model.load_weights(checkpoint_filename)
# Keep only the best model according to the validation loss (lower is better).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filename,
    verbose=1,
    mode='min',
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=False,
)
model.compile(
    optimizer='adam',
    loss=[LOSS, LOSS, LOSS],
    metrics=['mse'],
)
# NOTE(review): if dataset_to_use is an already-batched tf.data.Dataset,
# batch_size should not be passed to fit() -- confirm the input type.
model.fit(
    dataset_to_use,
    validation_data=dataset_validation_batched,
    # validation_steps=50,
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[cp_callback],
)
class ImgToClassSimpleContinuous(Model):
    '''
    Image model with three continuous outputs.

    A shared convolutional trunk feeds three identical heads. Each head ends
    in a single sigmoid unit, so calling the model returns three tensors.

    NOTE(review): the previous docstring said "pair with loss =
    categorical_crossentropy"; that does not look right for three
    independent single-unit sigmoid outputs -- confirm the intended loss.
    '''
    in_types = [DataType.d]
    out_types = [DataType.tlc, DataType.tls, DataType.tll]

    def __init__(self, img_height, img_width, *args, **kwargs):
        """Build the wrapped functional model.

        Args:
            img_height: Height of the single-channel input image.
            img_width: Width of the single-channel input image.
            *args, **kwargs: Forwarded unchanged to the base ``Model``.
        """
        # Forward only the caller's arguments. The original also passed the
        # unrelated name ``ImgToClassSimple`` as a positional argument.
        super().__init__(*args, **kwargs)
        initializer = 'he_normal'
        input_shape = (img_height, img_width, 1)
        inputs = tf.keras.Input(shape=input_shape)

        # Shared trunk: two conv blocks with 8 and then 16 filters.
        x = self._conv_block(inputs, 8, initializer)
        x = self._conv_block(x, 16, initializer)

        # Three structurally identical heads on top of the shared trunk.
        # Layers are created in the same order as before (all head feature
        # extractors first, then the Dense outputs).
        t = self._head_features(x, initializer)
        s = self._head_features(x, initializer)
        l = self._head_features(x, initializer)

        # One single-unit sigmoid output per head.
        t = layers.Dense(1, activation='sigmoid')(t)
        s = layers.Dense(1, activation='sigmoid')(s)
        l = layers.Dense(1, activation='sigmoid')(l)

        self.model = tf.keras.Model(inputs, [t, s, l])
        tf.keras.utils.plot_model(self.model, to_file="...", show_shapes=True)

    @staticmethod
    def _conv_block(x, filters, initializer):
        """Apply Conv-PReLU-Conv-PReLU-MaxPool-BatchNorm using `filters` filters."""
        x = layers.Conv2D(filters, 3, padding='same', kernel_initializer=initializer)(x)
        x = layers.PReLU()(x)
        x = layers.Conv2D(filters, 3, padding='same', kernel_initializer=initializer)(x)
        x = layers.PReLU()(x)
        x = layers.MaxPooling2D(pool_size=(2, 2))(x)
        return layers.BatchNormalization()(x)

    @classmethod
    def _head_features(cls, x, initializer):
        """Apply a 32-filter conv block, global average pooling and flatten."""
        x = cls._conv_block(x, 32, initializer)
        x = tf.keras.layers.GlobalAveragePooling2D()(x)
        return layers.Flatten()(x)

    def call(self, x):
        """Run the wrapped functional model and return its three outputs."""
        return self.model(x)

Related

Why does Keras model.fit not complain about an incompatible input shape?

I defined my model starting with inputs = tf.keras.Input(shape=(512, 512, 3), batch_size=BATCH_SIZE). Then I use model.fit with data of shape (1, 720, 1280, 3). The model still trains normally and the loss decreases. Why is that?
Thanks.
I also tried training with data of the same shape as the model input, but the results did not turn out well.
So here is the code:
inputs = tf.keras.Input(shape=(512, 512, 3), batch_size=BATCH_SIZE)  # REVISE
# Crop 256 rows off the top, then resize to 66x200 before the conv stack.
x = tf.keras.layers.Cropping2D(cropping=((256, 0), (0, 0)))(inputs)
x = tf.keras.layers.Resizing(66, 200)(x)
# x = x / 255
x = tf.keras.layers.Conv2D(24, (5, 5), (2, 2), activation="elu")(x)
x = tf.keras.layers.Conv2D(36, (5, 5), (2, 2), activation="elu")(x)
x = tf.keras.layers.Conv2D(48, (5, 5), (2, 2), activation="elu")(x)
x = tf.keras.layers.Conv2D(64, (3, 3), activation="elu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Conv2D(64, (3, 3), activation="elu")(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(100, activation='elu')(x)
x = tf.keras.layers.Dense(50, activation='elu')(x)
x = tf.keras.layers.Dense(10, activation='elu')(x)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='mse', optimizer=optimizer)
# Sanity check: push one generated batch through the model and show the shapes.
img_gen = image_data_generator(X_train, y_train, batch_size=1)
X = next(img_gen)[0]
print(X.shape)
print(model(X).shape)
# Saves the full model (save_weights_only is not set) whenever the monitored
# quantity improves; the monitor is left at the callback's default.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(model_output_dir, 'lane_navigation_new_nvmodel_big2'),
    verbose=1,
    save_best_only=True,
)  # revise
with tf.device('/device:GPU:0'):
    history = model.fit(
        # Important: a larger batch_size trains well; with a batch_size of 1
        # the model fails to learn anything!
        image_data_generator(X_train, y_train, batch_size=BATCH_SIZE),
        steps_per_epoch=10,  # 300
        epochs=300,
        validation_data=image_data_generator(X_valid, y_valid, batch_size=BATCH_SIZE),
        validation_steps=10,  # 200
        verbose=1,
        shuffle=True,
        callbacks=[checkpoint_callback],
    )

AssertionError: Could not compute output Tensor("dense/Sigmoid:0", shape=(None, 1), dtype=float32)

I am trying to create a multi-branch CNN-LSTM model. I have created four branches and concatenated them, but I am getting an assertion error. Here is my code.
top_words = 50
# Length every review is padded/truncated to. It must match the Input shapes
# below (the original padded to top_words=50 but declared inputs of length 250).
max_review_length = 250
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Pad all sequences
padded_inputs = pad_sequences(X_train, maxlen=max_review_length, value=0.0)
padded_inputs_test = pad_sequences(X_test, maxlen=max_review_length, value=0.0)
# create the model
embedding_vecor_length = 32
model1_input = keras.Input(shape=(max_review_length,))
model2_input = keras.Input(shape=(max_review_length,))
model3_input = keras.Input(shape=(max_review_length,))
model4_input = keras.Input(shape=(max_review_length,))
# A single embedding layer is shared by all four branches.
embedding_layer = Embedding(top_words, embedding_vecor_length)
encoded_model1_input = embedding_layer(model1_input)
encoded_model2_input = embedding_layer(model2_input)
encoded_model3_input = embedding_layer(model3_input)
encoded_model4_input = embedding_layer(model4_input)
# The branches differ only in the Conv1D kernel size (3, 5, 7, 9).
model1 = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu',
                kernel_regularizer=regularizers.l2(0.01))(encoded_model1_input)
model1 = MaxPooling1D(pool_size=2)(model1)
model1 = keras.layers.Dropout(0.4)(model1)
model1 = keras.layers.BatchNormalization()(model1)
model1 = Bidirectional(LSTM(128))(model1)
model2 = Conv1D(filters=32, kernel_size=5, padding='same', activation='relu',
                kernel_regularizer=regularizers.l2(0.01))(encoded_model2_input)
model2 = MaxPooling1D(pool_size=2)(model2)
model2 = keras.layers.Dropout(0.4)(model2)
model2 = keras.layers.BatchNormalization()(model2)
model2 = Bidirectional(LSTM(128))(model2)
model3 = Conv1D(filters=32, kernel_size=7, padding='same', activation='relu',
                kernel_regularizer=regularizers.l2(0.01))(encoded_model3_input)
model3 = MaxPooling1D(pool_size=2)(model3)
model3 = keras.layers.Dropout(0.4)(model3)
model3 = keras.layers.BatchNormalization()(model3)
model3 = Bidirectional(LSTM(128))(model3)
# Branch 4 must read its own input; the original reused encoded_model3_input,
# which left model4_input disconnected from the output.
model4 = Conv1D(filters=32, kernel_size=9, padding='same', activation='relu',
                kernel_regularizer=regularizers.l2(0.01))(encoded_model4_input)
model4 = MaxPooling1D(pool_size=2)(model4)
model4 = keras.layers.Dropout(0.4)(model4)
model4 = keras.layers.BatchNormalization()(model4)
model4 = Bidirectional(LSTM(128))(model4)
abc = keras.layers.concatenate([model1, model2, model3, model4])
out = Dense(1, activation='sigmoid')(abc)
model = keras.Model(inputs=[model1_input, model2_input, model3_input, model4_input], outputs=[out])
rmsprop = optimizers.RMSprop(learning_rate=0.01, decay=0.1)
model.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=['accuracy'])
model.summary()
# The model declares four inputs, so fit() needs one array per input; every
# branch receives the same padded sequences.
model.fit([padded_inputs] * 4, y_train, epochs=15, verbose=True, validation_split=0.2)
Model Architecture
What is wrong with this code? I cannot work out what I am doing wrong. Is the problem in how I call model.fit()? Please help.

ValueError: Shape (None, 17) must have rank 1

I am working on a handwritten character recognition model. I created a CNN + BiLSTM model trained with CTC loss, but I get an error when I run model.fit(). Please help me fix this error.
My Model
# Recognition network: a convolutional stack followed by two bidirectional
# LSTMs and a per-timestep softmax.
# input with shape of height=32 and width=128
inputs = Input(shape=(32,128,1))
# convolution layer with kernel size (3,3)
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
# pooling layer with kernel size (2,2)
pool_1 = MaxPooling2D(pool_size=(2, 2), strides=2)(conv_1)
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPooling2D(pool_size=(2, 2), strides=2)(conv_2)
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
# pooling layer with kernel size (2,1)
pool_4 = MaxPooling2D(pool_size=(2, 1))(conv_4)
conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
# Batch normalization layer
batch_norm_5 = BatchNormalization()(conv_5)
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPooling2D(pool_size=(2, 1))(batch_norm_6)
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)
# drop axis 1 so the LSTMs receive a 3-D tensor
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)
# bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)
# one softmax unit per entry of char_dict plus one extra
# (presumably the CTC blank symbol -- confirm)
outputs = Dense(len(char_dict)+1, activation = 'softmax')(blstm_2)
act_model = Model(inputs, outputs)
Define a CTC loss model that takes the outputs of previous model as inputs
# Auxiliary inputs for the CTC loss: the label sequence plus the two length tensors.
labels = Input(shape=(max_length,), dtype='float32', name='the_labels')
input_length = Input(shape=(1,), dtype='int64', name='input_length')
label_length = Input(shape=(1,), dtype='int64', name='label_length')
def ctc_lambda_func(args):
    """Return the batch CTC cost for ``(y_pred, labels, input_length, label_length)``.

    The four tensors arrive packed in a single sequence because the function
    is meant to be wrapped in a ``Lambda`` layer.
    """
    predictions, true_labels, pred_lengths, true_lengths = args
    # ctc_batch_cost takes the labels first, then the predictions.
    return K.ctc_batch_cost(true_labels, predictions, pred_lengths, true_lengths)
# Wrap the CTC cost in a Lambda layer so that it becomes the model's only output.
ctc_layer_inputs = [outputs, labels, input_length, label_length]
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(ctc_layer_inputs)
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
# The 'ctc' layer already outputs the loss value, so the compiled loss simply
# passes y_pred through and ignores y_true.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')

train_features = [input_array, output_array, train_input_length, train_label_length]
# y_true is ignored by the pass-through loss; zeros act as placeholder targets.
train_placeholder_targets = np.zeros(input_array.shape[0])
validation_features = [test_input_array, test_output_array, valid_input_length,
                       valid_label_length]
validation_placeholder_targets = [np.zeros(test_input_array.shape[0])]

model.fit(
    x=train_features,
    y=train_placeholder_targets,
    batch_size=256,
    epochs=100,
    validation_data=(validation_features, validation_placeholder_targets),
    verbose=1,
    callbacks=callbacks_list,
)
The error I am getting is
ValueError: Shape (None, 17) must have rank 1

TensorFlow Keras(v2.2) model fit with multiple outputs and losses failed

I want to use TensorFlow Keras(v2.2) model fit in mnist with multiple outputs and losses, but it failed.
My custom model returns a list [logits, embedding]. logits is a 2D tensor of shape [batch, 10] and embedding is also a 2D tensor, of shape [batch, 64].
class MyModel(tf.keras.Model):
    """Small CNN that returns both class logits and an L2-normalized embedding."""

    def __init__(self):
        super(MyModel, self).__init__()
        self.reshape = tf.keras.layers.Reshape((28, 28, 1))
        self.conv2D1 = tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')
        self.maxPool1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding="same")
        self.conv2D2 = tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu')
        self.maxPool2 = tf.keras.layers.MaxPooling2D(pool_size=2)
        self.flatten = tf.keras.layers.Flatten(data_format="channels_last")
        # Plain float rate. The original wrapped it in a TF1
        # placeholder_with_default, which is a graph-mode construct.
        self.dropout = tf.keras.layers.Dropout(0.25)
        self.dense1 = tf.keras.layers.Dense(64, activation=None)
        self.dense2 = tf.keras.layers.Dense(10, activation=None)

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of images; reshaped to (28, 28, 1) per sample.
            training: Forwarded to the Dropout layer, which is a no-op unless
                training is enabled.

        Returns:
            ``[logits, embedding]`` where ``embedding`` is the L2-normalized
            output of ``dense1`` and ``logits`` is ``dense2(embedding)``.
        """
        x = self.reshape(inputs)
        x = self.conv2D1(x)
        x = self.maxPool1(x)
        # Let the layer decide via `training` instead of a Python `if`, which
        # misbehaves when `training` is None or a tensor.
        x = self.dropout(x, training=training)
        x = self.conv2D2(x)
        x = self.maxPool2(x)
        x = self.dropout(x, training=training)
        x = self.flatten(x)
        x = self.dense1(x)
        embedding = tf.math.l2_normalize(x, axis=1)
        logits = self.dense2(embedding)
        return [logits, embedding]
loss_0 is normal cross_entropy
def loss_0(y_true, y_pred):
    """Return the mean sparse softmax cross-entropy of ``y_pred[0]`` against ``y_true``.

    The original computed this value but never returned it, so the function
    always yielded ``None``.

    NOTE(review): ``y_pred[0]`` assumes ``y_pred`` is the model's
    ``[logits, embedding]`` list. If Keras hands each loss in a loss list only
    its own output, this index selects the first sample instead -- confirm.
    """
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred[0])
    )
loss_1 is triplet_semihard_loss
def loss_1(y_true, y_pred):
    """Return the semi-hard triplet loss (L2 distance) of ``y_pred[1]`` against ``y_true``."""
    return tfa.losses.triplet_semihard_loss(
        y_true=y_true,
        y_pred=y_pred[1],
        distance_metric="L2",
    )
When I use model.fit, each loss function only receives the logits tensor; I cannot get the embedding tensor. Indexing with y_pred[0] and y_pred[1] does not work. Any suggestions?
model = MyModel()
# One loss per model output, each weighted by 0.1.
# `learning_rate` replaces the deprecated `lr` alias.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=[loss_0, loss_1],
    loss_weights=[0.1, 0.1],
)
history = model.fit(train_dataset, epochs=5)

Distribute Tensor over Multiple GPUs

I am attempting to train a model in which the input exceeds the memory limits for a single GPU on the system (16 GB P100). The size of the input is (1,256,256,64,2). However, I have access to 4 identical GPUs on the system. I know I can distribute processes with tf.distribute but I am unsure how to do this with a batch size of 1. Is it possible to distribute a single sample over multiple GPUs so I don't receive OOM errors?
Edit:
Here is the code used to build the model.
def dice_loss(y_true, y_pred):
    """Return ``1 - 2*sum(y_true*y_pred) / sum(y_true+y_pred)``.

    Both sums run over axes 1, 2 and 3; the result is reshaped to (-1, 1, 1).
    """
    reduce_axes = (1, 2, 3)
    overlap = tf.reduce_sum(y_true * y_pred, axis=reduce_axes)
    total = tf.reduce_sum(y_true + y_pred, axis=reduce_axes)
    dice = 2 * overlap / total
    return tf.reshape(1 - dice, (-1, 1, 1))
class ResidualUnitEncode(keras.layers.Layer):
    """Residual unit built from two 3x3x3 ``Conv3D`` layers plus a skip path.

    Main path: Conv3D(strides) -> BatchNorm -> activation -> Conv3D(1) ->
    BatchNorm. When ``strides > 1`` the skip path applies a strided 1x1x1
    Conv3D and BatchNorm; otherwise the input is added unchanged. The
    activation is applied to the sum.
    """

    def __init__(self, filters=1, strides=1, activation="relu", **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.filters = filters
        self.strides = strides
        self.activation = keras.activations.get(activation)
        self.main_layers = [
            keras.layers.Conv3D(filters, (3, 3, 3), strides=strides,
                                padding="same", use_bias=False),
            keras.layers.BatchNormalization(),
            self.activation,
            keras.layers.Conv3D(filters, (3, 3, 3), strides=1,
                                padding="same", use_bias=False),
            keras.layers.BatchNormalization()]
        self.skip_layers = []
        if strides > 1:
            self.skip_layers = [
                keras.layers.Conv3D(filters, (1, 1, 1), strides=strides,
                                    padding="same", use_bias=False),
                keras.layers.BatchNormalization()]

    def call(self, inputs):
        """Return ``activation(main_path(inputs) + skip_path(inputs))``."""
        Z = inputs
        for layer in self.main_layers:
            Z = layer(Z)
        skip_Z = inputs
        for layer in self.skip_layers:
            skip_Z = layer(skip_Z)
        return self.activation(Z + skip_Z)

    def get_config(self):
        """Return the base config plus ``filters``, ``strides`` and ``activation``.

        The original returned only the base config, so a layer rebuilt from
        it fell back to the constructor defaults.
        """
        base_config = super(ResidualUnitEncode, self).get_config()
        return {
            **base_config,
            "filters": self.filters,
            "strides": self.strides,
            "activation": keras.activations.serialize(self.activation),
        }
class ResidualUnitDecode(keras.layers.Layer):
    """Residual unit built from two 3x3x3 ``Conv3DTranspose`` layers plus a skip path.

    Main path: Conv3DTranspose(1) -> BatchNorm -> activation ->
    Conv3DTranspose(strides) -> BatchNorm. When ``strides > 1`` the skip path
    applies a strided 3x3x3 Conv3DTranspose and BatchNorm; otherwise the
    input is added unchanged. The activation is applied to the sum.
    """

    def __init__(self, filters=1, strides=1, activation="relu", **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.filters = filters
        self.strides = strides
        self.activation = keras.activations.get(activation)
        self.main_layers = [
            keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=1,
                                         padding="same", use_bias=False),
            keras.layers.BatchNormalization(),
            self.activation,
            keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=strides,
                                         padding="same", use_bias=False),
            keras.layers.BatchNormalization()]
        self.skip_layers = []
        if strides > 1:
            self.skip_layers = [
                keras.layers.Conv3DTranspose(filters, (3, 3, 3), strides=strides,
                                             padding="same", use_bias=False),
                keras.layers.BatchNormalization()]

    def call(self, inputs):
        """Return ``activation(main_path(inputs) + skip_path(inputs))``."""
        Z = inputs
        for layer in self.main_layers:
            Z = layer(Z)
        skip_Z = inputs
        for layer in self.skip_layers:
            skip_Z = layer(skip_Z)
        return self.activation(Z + skip_Z)

    def get_config(self):
        """Return the base config plus ``filters``, ``strides`` and ``activation``.

        The original returned only the base config, so a layer rebuilt from
        it fell back to the constructor defaults.
        """
        base_config = super(ResidualUnitDecode, self).get_config()
        return {
            **base_config,
            "filters": self.filters,
            "strides": self.strides,
            "activation": keras.activations.serialize(self.activation),
        }
def build_unet(image_shape, batch_size):
    """Build, compile and return the residual 3-D U-Net style model.

    Args:
        image_shape: Per-sample input shape, passed to ``keras.layers.Input``.
        batch_size: Fixed batch size baked into the input layer.

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 0.001) and
        ``dice_loss``. The model summary is printed as a side effect.
    """
    inputs = keras.layers.Input(shape=image_shape, batch_size=batch_size)

    # Encoder: a stem convolution followed by four strided residual units.
    conv1 = keras.layers.Conv3D(64, (7, 7, 7), strides=(2, 2, 1), padding="same", use_bias=False, input_shape=image_shape)(inputs)
    conv1 = keras.layers.BatchNormalization()(conv1)
    conv1 = keras.layers.Activation("relu")(conv1)
    pool1 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv1)
    conv2 = ResidualUnitEncode(filters=128, strides=2)(pool1)
    pool2 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv2)
    conv3 = ResidualUnitEncode(filters=256, strides=2)(pool2)
    pool3 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv3)
    conv4 = ResidualUnitEncode(filters=512, strides=2)(pool3)
    pool4 = keras.layers.MaxPool3D(pool_size=(3, 3, 3), strides=1, padding="same")(conv4)
    conv5 = ResidualUnitEncode(filters=1024, strides=2)(pool4)
    drop5 = keras.layers.Dropout(0.5)(conv5)

    # Decoder: each stage upsamples, concatenates the matching encoder
    # output on the channel axis (axis=4), then refines.
    up6 = ResidualUnitDecode(filters=512, strides=2)(drop5)
    merge6 = keras.layers.concatenate([conv4, up6], axis=4)
    conv6 = ResidualUnitEncode(filters=512, strides=2)(merge6)
    conv6 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv6)
    up7 = ResidualUnitDecode(filters=256, strides=2)(conv6)
    merge7 = keras.layers.concatenate([conv3, up7], axis=4)
    conv7 = ResidualUnitEncode(filters=256, strides=2)(merge7)
    conv7 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv7)
    up8 = ResidualUnitDecode(filters=128, strides=2)(conv7)
    merge8 = keras.layers.concatenate([conv2, up8], axis=4)
    conv8 = ResidualUnitEncode(filters=128, strides=2)(merge8)
    conv8 = keras.layers.UpSampling3D(size=(2, 2, 2))(conv8)
    up9 = ResidualUnitDecode(filters=64, strides=2)(conv8)
    merge9 = keras.layers.concatenate([conv1, up9], axis=4)
    conv9 = ResidualUnitDecode(filters=64, strides=2)(merge9)

    # Single-channel sigmoid output.
    conv10 = keras.layers.Conv3D(1, 1, strides=(1, 1, 2), activation="sigmoid")(conv9)
    model = keras.Model(inputs, conv10)
    # `learning_rate` replaces the deprecated `lr` alias.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=dice_loss)
    model.summary()
    return model
Here is the code to run the training using Kfold CV:
image_shape = [256, 256, 64, 2]
# Load all 69 samples as a single batch so they can be sliced below.
dataset = tf.data.TFRecordDataset('train.tfrecord').map(parse_record).batch(69)
nx = tf.compat.v1.data.make_one_shot_iterator(dataset)
x, y = nx.get_next()
# The first 54 samples feed the cross-validation loop; the remaining 15 are
# held out. (The original test slice started at 55 and dropped sample 54.)
x_train = x[0:54, ...]
y_train = y[0:54, ...]
x_test = x[54:69, ...]
y_test = y[54:69, ...]
kfold = KFold(n_splits=10, shuffle=True)
fold_no = 1
acc_per_fold = []
loss_per_fold = []
for train, test in kfold.split(x_train, y_train):
    model = build_unet(image_shape=image_shape, batch_size=1)
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss')
    model_file_name = './Fold_' + str(fold_no) + '_best_model.h5'
    # save_best_only so the "_best_model" file really holds the best epoch.
    model_checkpoint = keras.callbacks.ModelCheckpoint(
        model_file_name, monitor='val_loss', save_best_only=True)
    log_dir_name = './Fold_' + str(fold_no) + '_log_dir'
    tb = keras.callbacks.TensorBoard(log_dir_name)
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')
    # Select this fold's rows by index.
    train_id_rows = tf.constant(train.reshape(-1, 1))
    test_id_rows = tf.constant(test.reshape(-1, 1))
    x_train_train = tf.gather_nd(x_train, train_id_rows)
    y_train_train = tf.gather_nd(y_train, train_id_rows)
    x_train_test = tf.gather_nd(x_train, test_id_rows)
    y_train_test = tf.gather_nd(y_train, test_id_rows)
    # Both callbacks monitor 'val_loss', which only exists when validation
    # data is supplied; the fold's held-out part serves as that data.
    history = model.fit(
        x_train_train,
        y_train_train,
        validation_data=(x_train_test, y_train_test),
        epochs=N_EPOCHS,
        callbacks=[tb, model_checkpoint, early_stopping],
        batch_size=1,
    )
    # The model is built with a fixed batch size of 1, so evaluate with it too.
    scores = model.evaluate(x_train_test, y_train_test, batch_size=1, verbose=0)
    # evaluate() returns a bare scalar when no metrics are compiled, so only
    # index into the result when it is a sequence.
    if isinstance(scores, (list, tuple)):
        loss_per_fold.append(scores[0])
        if len(scores) > 1:
            acc_per_fold.append(scores[1] * 100)
    else:
        loss_per_fold.append(scores)
    fold_no = fold_no + 1
There are 69 total samples in the dataset, 54 used for the training/validation loop.