TensorFlow Keras (v2.2) model.fit with multiple outputs and losses fails

I want to use TensorFlow Keras (v2.2) model.fit on MNIST with multiple outputs and losses, but it fails.
My custom model returns a list [logits, embedding], where logits is a 2D tensor [batch, 10] and embedding is a 2D tensor [batch, 64].
class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.reshape = tf.keras.layers.Reshape((28, 28, 1))
        self.conv2D1 = tf.keras.layers.Conv2D(filters=8, kernel_size=(3,3), strides=(1, 1), padding='same', activation='relu')
        self.maxPool1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding="same")
        self.conv2D2 = tf.keras.layers.Conv2D(filters=8, kernel_size=(3,3), strides=(1, 1), padding='same', activation='relu')
        self.maxPool2 = tf.keras.layers.MaxPooling2D(pool_size=2)
        self.flatten = tf.keras.layers.Flatten(data_format="channels_last")
        self.dropout = tf.keras.layers.Dropout(tf.compat.v1.placeholder_with_default(0.25, shape=[], name="dropout"))
        self.dense1 = tf.keras.layers.Dense(64, activation=None)
        self.dense2 = tf.keras.layers.Dense(10, activation=None)

    def call(self, inputs, training):
        x = self.reshape(inputs)
        x = self.conv2D1(x)
        x = self.maxPool1(x)
        if training:
            x = self.dropout(x)
        x = self.conv2D2(x)
        x = self.maxPool2(x)
        if training:
            x = self.dropout(x)
        x = self.flatten(x)
        x = self.dense1(x)
        embedding = tf.math.l2_normalize(x, axis=1)
        logits = self.dense2(embedding)
        return [logits, embedding]
loss_0 is a normal cross-entropy loss:
def loss_0(y_true, y_pred):
    loss_0 = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred[0]))
    return loss_0
loss_1 is the triplet semi-hard loss from TensorFlow Addons:
def loss_1(y_true, y_pred):
    loss_1 = tfa.losses.triplet_semihard_loss(y_true=y_true, y_pred=y_pred[1], distance_metric="L2")
    return loss_1
When I use model.fit, each loss only receives the logits tensor; I can't get the embedding tensor, and indexing with y_pred[0] and y_pred[1] does not work. Any suggestions?
model = MyModel()
model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3), loss=[loss_0, loss_1], loss_weights=[0.1, 0.1])
history = model.fit(train_dataset, epochs=5)
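For what it's worth, my understanding of the Keras multi-output contract (a sketch, not verified against this exact model) is that with loss=[loss_0, loss_1] each loss function receives only its matching output as y_pred: loss_0 already gets the logits and loss_1 already gets the embedding, so y_pred[0] slices the batch dimension of a single tensor instead of selecting a list element. Under that assumption the losses would drop the indexing, and the dataset would have to yield the label once per output:

def loss_0(y_true, y_pred):
    # y_pred here is assumed to be the logits output, shape [batch, 10]
    labels = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=y_pred))

def loss_1(y_true, y_pred):
    # y_pred here is assumed to be the embedding output, shape [batch, 64]
    return tfa.losses.triplet_semihard_loss(
        y_true=tf.reshape(y_true, [-1]), y_pred=y_pred, distance_metric="L2")

# The label structure must match the output structure, e.g. duplicate it per output:
# train_dataset = train_dataset.map(lambda x, y: (x, (y, y)))
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss=[loss_0, loss_1], loss_weights=[0.1, 0.1])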

Related

Keras model.fit does not complain about incompatible shape?

I defined my model starting with inputs = tf.keras.Input(shape=(512, 512, 3), batch_size=BATCH_SIZE). Then I use model.fit with data of shape (1, 720, 1280, 3). The model still trains normally and the loss decreases. Why is that? Thanks.
I tried training with the matching shape, but the result did not turn out to be good.
So here is the code:
inputs = tf.keras.Input(shape=(512, 512, 3), batch_size=BATCH_SIZE) # REVISE
x = tf.keras.layers.Cropping2D(cropping=((256, 0), (0, 0)))(inputs)
x = tf.keras.layers.Resizing(66, 200)(x)
# x = x / 255
x = tf.keras.layers.Conv2D(24, (5,5), (2,2), activation="elu")(x)
x = tf.keras.layers.Conv2D(36, (5,5), (2,2), activation="elu")(x)
x = tf.keras.layers.Conv2D(48, (5,5), (2,2), activation="elu")(x)
x = tf.keras.layers.Conv2D(64, (3,3), activation="elu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Conv2D(64, (3,3), activation="elu")(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(100, activation='elu')(x)
x = tf.keras.layers.Dense(50, activation='elu')(x)
x = tf.keras.layers.Dense(10, activation='elu')(x)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
print(model.summary())
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='mse', optimizer=optimizer)
img_gen = image_data_generator(X_train, y_train, batch_size=1)
X = next(img_gen)[0]
print(X.shape)
print(model(X).shape)
# saves the model weights after each epoch if the validation loss decreased
checkpoint_callback = keras.callbacks.ModelCheckpoint(filepath=os.path.join(model_output_dir, 'lane_navigation_new_nvmodel_big2'), verbose=1, save_best_only=True)  # revise
with tf.device('/device:GPU:0'):
    history = model.fit(image_data_generator(X_train, y_train, batch_size=BATCH_SIZE),  # important! a larger batch_size trains much better; batch_size=1 learns nothing
                        steps_per_epoch=10,  # 300
                        epochs=300,
                        validation_data=image_data_generator(X_valid, y_valid, batch_size=BATCH_SIZE),
                        validation_steps=10,  # 200
                        verbose=1,
                        shuffle=1,
                        callbacks=[checkpoint_callback])
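A plausible explanation (assuming standard TF 2.x behavior, not verified on this exact setup): the Input layer only records the intended spatial size; when fit sees a different size, Keras typically logs a "was called on an input with incompatible shape" warning rather than raising an error and re-traces the graph, and because Cropping2D followed by Resizing(66, 200) makes every downstream shape independent of the input size, the forward pass still works. A quick probe of just those two layers:

import numpy as np
import tensorflow as tf

probe = tf.keras.Sequential([
    tf.keras.layers.Cropping2D(cropping=((256, 0), (0, 0))),
    tf.keras.layers.Resizing(66, 200),  # tf.keras.layers.experimental.preprocessing.Resizing on older TF
])
print(probe(np.zeros((1, 512, 512, 3), dtype="float32")).shape)   # (1, 66, 200, 3)
print(probe(np.zeros((1, 720, 1280, 3), dtype="float32")).shape)  # (1, 66, 200, 3)

So the declared (512, 512, 3) shape is never actually enforced past the warning, which is why training proceeds.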

Tensorflow ModelCheckpoint not saving model, no loss after reloading

The callback is saving checkpoint files, but not the SavedModel model.pb file. Additionally, when I load the model from the checkpoints it does not restore the 'val_loss' that I'm conditioning save_best_only on.
I tried calling model.save() only on the best iteration, but I had trouble getting that to work correctly, and it would be more convenient to use the ModelCheckpoint callback.
Here is the relevant code:
LOSS = tf.keras.losses.MeanSquaredError()
# multi output, 3 categories from 0 to 1
model = ImgToClassSimpleContinuous(img_height, img_width)
checkpoint_filename = "../chkpts/ImgToClassSimpleContinuous/checkpoint_dir"
model.load_weights(checkpoint_filename)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filename,
                                                 verbose=1, mode='min', monitor="val_loss",
                                                 save_best_only=True, save_weights_only=False)
model.compile(
    optimizer='adam',
    loss=[LOSS, LOSS, LOSS],
    metrics=['mse'])
model.fit(
    dataset_to_use,
    validation_data=dataset_validation_batched,
    # validation_steps=50,
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[cp_callback]
)
class ImgToClassSimpleContinuous(Model):
    '''
    pair with loss = categorical_crossentropy
    '''
    in_types = [DataType.d]
    out_types = [DataType.tlc, DataType.tls, DataType.tll]

    def __init__(self, img_height, img_width, *args, **kwargs):
        super().__init__(*args, **kwargs)
        initializer = 'he_normal'
        input_shape = (img_height, img_width, 1)
        inputs = tf.keras.Input(shape=input_shape)
        flat_pix = layers.Flatten()(inputs)

        x = layers.Conv2D(8, 3, padding='same', kernel_initializer=initializer)(inputs)
        x = layers.PReLU()(x)
        x = layers.Conv2D(8, 3, padding='same', kernel_initializer=initializer)(x)
        x = layers.PReLU()(x)
        x = layers.MaxPooling2D(pool_size=(2, 2))(x)
        x = layers.BatchNormalization()(x)

        x = layers.Conv2D(16, 3, padding='same', kernel_initializer=initializer)(x)
        x = layers.PReLU()(x)
        x = layers.Conv2D(16, 3, padding='same', kernel_initializer=initializer)(x)
        x = layers.PReLU()(x)
        x = layers.MaxPooling2D(pool_size=(2, 2))(x)
        x = layers.BatchNormalization()(x)

        t = layers.Conv2D(32, 3, padding='same', kernel_initializer=initializer)(x)
        t = layers.PReLU()(t)
        t = layers.Conv2D(32, 3, padding='same', kernel_initializer=initializer)(t)
        t = layers.PReLU()(t)
        t = layers.MaxPooling2D(pool_size=(2, 2))(t)
        t = layers.BatchNormalization()(t)
        t = tf.keras.layers.GlobalAveragePooling2D()(t)
        t = layers.Flatten()(t)

        s = layers.Conv2D(32, 3, padding='same', kernel_initializer=initializer)(x)
        s = layers.PReLU()(s)
        s = layers.Conv2D(32, 3, padding='same', kernel_initializer=initializer)(s)
        s = layers.PReLU()(s)
        s = layers.MaxPooling2D(pool_size=(2, 2))(s)
        s = layers.BatchNormalization()(s)
        s = tf.keras.layers.GlobalAveragePooling2D()(s)
        s = layers.Flatten()(s)

        l = layers.Conv2D(32, 3, padding='same', kernel_initializer=initializer)(x)
        l = layers.PReLU()(l)
        l = layers.Conv2D(32, 3, padding='same', kernel_initializer=initializer)(l)
        l = layers.PReLU()(l)
        l = layers.MaxPooling2D(pool_size=(2, 2))(l)
        l = layers.BatchNormalization()(l)
        l = tf.keras.layers.GlobalAveragePooling2D()(l)
        l = layers.Flatten()(l)

        t = layers.Dense(1, activation='sigmoid')(t)
        s = layers.Dense(1, activation='sigmoid')(s)
        l = layers.Dense(1, activation='sigmoid')(l)

        # A Dense classifier with a single unit (binary classification)
        self.model = tf.keras.Model(inputs, [t, s, l])
        tf.keras.utils.plot_model(self.model, to_file="...", show_shapes=True)

    def call(self, x):
        return self.model(x)
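As far as I can tell, ModelCheckpoint re-initializes its internal best value every time it is constructed, so a restarted run has no memory of the previous best val_loss and save_best_only starts from scratch. If your Keras version supports ModelCheckpoint's initial_value_threshold argument you can seed it with the last known best; otherwise a small companion callback can persist the value itself. A minimal sketch (the best_val.json path is hypothetical, untested against this model):

import json
import os
import tensorflow as tf

class BestValTracker(tf.keras.callbacks.Callback):
    """Keeps the best val_loss seen so far in a small JSON file across restarts."""
    def __init__(self, path="best_val.json"):
        super().__init__()
        self.path = path
        self.best = float("inf")
        if os.path.exists(path):
            with open(path) as f:
                self.best = json.load(f)["best_val_loss"]

    def on_epoch_end(self, epoch, logs=None):
        val_loss = (logs or {}).get("val_loss")
        if val_loss is not None and val_loss < self.best:
            self.best = float(val_loss)
            with open(self.path, "w") as f:
                json.dump({"best_val_loss": self.best}, f)

The stored value can then be read back before constructing the ModelCheckpoint (or passed to initial_value_threshold where available) so a reloaded run does not overwrite a better checkpoint.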

TF2 code 10 times slower than equivalent PyTorch code for a Conv1D network

I've been trying to translate some PyTorch code to TensorFlow 2, but the TF2 code is around 10 times slower. I've tried looking at where this might come from, and as far as I can tell it comes from the tape.gradient call (performance was the same with keras' .fit function). I've tried to use different data loaders, ways of declaring the model, installations, etc... and the results have been consistent.
Any explanation / solution as to why this is happening would be much appreciated.
Here is a minimalist version of the TF2 code:
import time
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 120, 18, 1)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.batch(256)

# Create a small model
model = tf.keras.Sequential([
    layers.Conv1D(64, kernel_size=7, strides=3, padding="same", activation="relu"),
    layers.Conv1D(64, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(256, kernel_size=1, strides=1, padding="same", activation="relu"),
    layers.GlobalAveragePooling2D(),
    layers.Flatten(),
    layers.Dense(128, use_bias=True, activation="relu"),
    layers.Dense(32, use_bias=True, activation="relu"),
    layers.Dense(1, activation='sigmoid', use_bias=True),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=5e-4)

@tf.function
def train_step(data_batch, labels_batch):
    with tf.GradientTape() as tape:
        y_pred = model(data_batch)
        loss = tf.keras.losses.MSE(labels_batch, y_pred)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))

step_times = []
for epoch in range(20):
    for data_batch, labels_batch in train_dataset:
        step_start_time = time.perf_counter()
        train_step(data_batch, labels_batch)
        if epoch != 0:
            step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")
And the PyTorch equivalent:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.backends.cudnn.benchmark = True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 18, 120)

# Create a small model
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(18, 64, kernel_size=7, stride=3, padding=3)
        self.conv2 = nn.Conv1d(64, 64, kernel_size=5, stride=2, padding=2)
        self.conv3 = nn.Conv1d(64, 128, kernel_size=5, stride=2, padding=2)
        self.conv4 = nn.Conv1d(128, 128, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv1d(128, 128, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv1d(128, 256, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, inputs):
        x = F.relu(self.conv1(inputs))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = x.mean(2)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

model = Model()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
loss_fn = torch.nn.MSELoss()

batch_size = 256
train_steps_per_epoch = train_data.shape[0] // batch_size
step_times = []
for epoch in range(20):
    for step in range(train_steps_per_epoch):
        batch_start, batch_end = step * batch_size, (step+1) * batch_size
        data_batch = torch.FloatTensor(train_data[batch_start:batch_end]).to(device)
        labels_batch = torch.FloatTensor(train_labels[batch_start:batch_end]).to(device)
        step_start_time = time.perf_counter()
        optimizer.zero_grad()
        y_pred = model(data_batch)
        loss = loss_fn(labels_batch, torch.squeeze(y_pred))
        loss.backward()
        optimizer.step()
        if epoch != 0:
            step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")
You're using tf.GradientTape correctly, but both your models and data are different in the snippets you provided.
Here is the TF code that uses the same data and model architecture as your Pytorch model.
import time
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

# Generate some fake data
train_labels = np.random.randint(10, size=1000)
train_data = np.random.rand(1000, 120, 18)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.batch(256)

model = tf.keras.Sequential([
    layers.Conv1D(64, kernel_size=7, strides=3, padding="same", activation="relu"),
    layers.Conv1D(64, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=5, strides=2, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(128, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.Conv1D(256, kernel_size=3, strides=1, padding="same", activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, use_bias=True, activation="relu"),
    layers.Dense(32, use_bias=True, activation="relu"),
    layers.Dense(1, activation='sigmoid', use_bias=True),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3, decay=5e-4)

@tf.function
def train_step(data_batch, labels_batch, model):
    with tf.GradientTape() as tape:
        y_pred = model(data_batch, training=True)
        loss = tf.keras.losses.MSE(labels_batch, y_pred)
    gradients = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))

step_times = []
for epoch in range(20):
    for data_batch, labels_batch in train_dataset:
        step_start_time = time.perf_counter()
        train_step(data_batch, labels_batch, model)
        if epoch != 0:
            step_times.append(time.perf_counter()-step_start_time)
print(f"Average training step time: {np.mean(step_times):.3f}s.")
So, in reality, TF is about 3 times faster than PyTorch here: 0.035s vs 0.112s per training step.

ValueError: Shape (None, 17) must have rank 1

I am working on a handwritten character recognition model. I created a CNN + BiLSTM + CTC loss model, but I get an error when I run model.fit(). Please help me fix this error.
My model:
# input with shape of height=32 and width=128
inputs = Input(shape=(32,128,1))
# convolution layer with kernel size (3,3)
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
# pooling layer with kernel size (2,2)
pool_1 = MaxPooling2D(pool_size=(2, 2), strides=2)(conv_1)
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPooling2D(pool_size=(2, 2), strides=2)(conv_2)
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
# pooling layer with kernel size (2,1)
pool_4 = MaxPooling2D(pool_size=(2, 1))(conv_4)
conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
# Batch normalization layer
batch_norm_5 = BatchNormalization()(conv_5)
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPooling2D(pool_size=(2, 1))(batch_norm_6)
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)
# bidirectional LSTM layers with units=128
blstm_1 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(128, return_sequences=True, dropout = 0.2))(blstm_1)
outputs = Dense(len(char_dict)+1, activation = 'softmax')(blstm_2)
act_model = Model(inputs, outputs)
Then I define a CTC loss model that takes the outputs of the previous model as inputs:
labels = Input(name='the_labels', shape=[max_length], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')
model.fit(x=[input_array,
             output_array,
             train_input_length,
             train_label_length],
          y=np.zeros(input_array.shape[0]),
          batch_size=256,
          epochs=100,
          validation_data=([test_input_array, test_output_array, valid_input_length, valid_label_length],
                           [np.zeros(test_input_array.shape[0])]),
          verbose=1,
          callbacks=callbacks_list)
The error I am getting is
ValueError: Shape (None, 17) must have rank 1
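Not a definitive diagnosis, but when K.ctc_batch_cost raises rank/shape errors the first thing I would check is that the four inputs match its documented contract: y_true (samples, max_string_length), y_pred (samples, time_steps, num_categories), input_length (samples, 1) and label_length (samples, 1). A small sanity check, assuming the arrays passed to fit() above:

import numpy as np

print(input_array.shape)         # expected (num_samples, 32, 128, 1) - image input
print(output_array.shape)        # expected (num_samples, max_length) - integer label sequences
print(train_input_length.shape)  # expected (num_samples, 1)
print(train_label_length.shape)  # expected (num_samples, 1)

# If the two length arrays come out 1-D, reshaping them to (num_samples, 1) is an easy fix:
train_input_length = np.asarray(train_input_length).reshape(-1, 1)
train_label_length = np.asarray(train_label_length).reshape(-1, 1)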

Why is the reduce_mean applied to the output of sparse_softmax_cross_entropy_with_logits?

There are several tutorials that apply reduce_mean to the output of sparse_softmax_cross_entropy_with_logits. For example:
cross_entropy = -tf.reduce_sum(y_ * tf.log(y_conv))
or
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.cast(y_, dtype=tf.int32), logits=y_conv))
Why is the reduce_mean applied to the output of sparse_softmax_cross_entropy_with_logits? Is it because we are using mini-batches, and so we want to calculate (using reduce_mean) the average loss over all samples of the mini-batch?
The reason is to get the average loss over the batch.
Generally you train a neural network with input batches of size > 1; each element in the batch produces a loss value, so the easiest way to merge these into a single value is to average them.
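A tiny illustration of that reduction (the values are made up):

import numpy as np

per_example_loss = np.array([0.3, 1.2, 0.6, 0.9])  # one loss value per sample in the batch
batch_loss = per_example_loss.mean()               # single scalar used for the gradient step
print(batch_loss)  # 0.75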
I found something interesting. First, let's define sparse_vector as
sparse_vector = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.cast(y_, dtype=tf.int32), logits=y_conv)
sparse_vector is a vector of per-example losses, and we need to reduce it to a single scalar, which is why we use reduce_mean.
import numpy as np
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
print(mnist.test.labels.shape)
print(mnist.train.labels.shape)

with tf.name_scope('inputs'):
    X_ = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.int64, [None])

X = tf.reshape(X_, [-1, 28, 28, 1])
h_conv1 = tf.layers.conv2d(X, filters=32, kernel_size=5, strides=1,
                           padding='same', activation=tf.nn.relu, name='conv1')
h_pool1 = tf.layers.max_pooling2d(h_conv1, pool_size=2, strides=2,
                                  padding='same', name='pool1')
h_conv2 = tf.layers.conv2d(h_pool1, filters=64, kernel_size=5, strides=1,
                           padding='same', activation=tf.nn.relu, name='conv2')
h_pool2 = tf.layers.max_pooling2d(h_conv2, pool_size=2, strides=2,
                                  padding='same', name='pool2')

# flatten
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.layers.dense(h_pool2_flat, 1024, name='fc1', activation=tf.nn.relu)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, 0.5)
h_fc2 = tf.layers.dense(h_fc1_drop, units=10, name='fc2')
# y_conv = tf.nn.softmax(h_fc2)
y_conv = h_fc2
# print('Finished building network.')

# cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv))
sparse_vector = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.cast(y_, dtype=tf.int32), logits=y_conv)
cross_entropy = tf.reduce_mean(sparse_vector)

sess.run(tf.global_variables_initializer())
# print(sparse_vector)
# print(cross_entropy)
# Tensor("SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits:0", shape=(?,), dtype=float32)
# Tensor("Mean:0", shape=(), dtype=float32)

batch = mnist.train.next_batch(10)
sparse_vector, cross_entropy = sess.run(
    [sparse_vector, cross_entropy],
    feed_dict={X_: batch[0], y_: batch[1]})
print(sparse_vector)
print(cross_entropy)
The output is:
[2.2213464 2.2676413 2.3555744 2.3196406 2.0794516 2.394274 2.266591
2.3139718 2.345526 2.3952296]
2.2959247
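As a quick check, averaging the ten per-example values above reproduces the scalar cross_entropy:

import numpy as np

per_example = [2.2213464, 2.2676413, 2.3555744, 2.3196406, 2.0794516,
               2.394274, 2.266591, 2.3139718, 2.345526, 2.3952296]
print(np.mean(per_example))  # ~2.2959247, the same value reduce_mean produced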