Keras sequential model to Tensorflow EstimatorSpec accuracy decreases - tensorflow

I'm having some issues converting a Keras model (keras_model_fn) over to a TF model_fn for use in SageMaker.
The models look like this:
Keras
def keras_model_fn(hyperparameters):
    """Build and compile the Keras text classifier.

    The network takes 8 token ids, looks them up in a 2500 x 128 embedding,
    flattens the result and feeds it to a softmax Dense layer with
    NUM_CLASSES units. `hyperparameters` is accepted but not read here.

    Returns the compiled tf.keras.Sequential model.
    """
    layers = [
        tf.keras.layers.InputLayer(input_shape=[8], name='main_input'),
        # increase input_dim (cur 2500) as amount of words go up
        tf.keras.layers.Embedding(2500, 128, input_length=8),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'),
    ]
    classifier = tf.keras.Sequential()
    for layer in layers:
        classifier.add(layer)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['acc'],
    )
    return classifier
Tensorflow
def model_fn(features, labels, mode, params):
    """Estimator model_fn: input -> embedding -> flatten -> softmax Dense.

    Reads the input tensor from features[INPUT_TENSOR_NAME]. Returns a
    predictions-only EstimatorSpec in PREDICT mode; otherwise returns a spec
    with loss, train_op and an accuracy metric. `params` is not read here.
    """
    input_layer = tf.keras.layers.InputLayer(
        input_shape=(8,))(features[INPUT_TENSOR_NAME])
    embedding_layer = tf.keras.layers.Embedding(
        2500,
        128,
        input_length=8)(input_layer)
    flattened = tf.keras.layers.Flatten()(embedding_layer)
    # `predictions` already has softmax applied by the Dense layer.
    predictions = tf.keras.layers.Dense(
        NUM_CLASSES,
        activation='softmax')(flattened)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"output": predictions})
    # NOTE(review): tf.losses.softmax_cross_entropy expects unscaled logits,
    # but `predictions` is softmax output, so softmax is effectively applied
    # twice here.
    loss = tf.losses.softmax_cross_entropy(labels, predictions)
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.train.get_global_step(),
        learning_rate=0.001,
        optimizer="Adam")
    # `predictions_dict` is not used below.
    predictions_dict = {"output": predictions}
    # NOTE(review): the metric compares int32-cast labels directly against
    # the softmax probabilities; presumably both should be reduced to class
    # indices first -- confirm.
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(
            tf.cast(labels,tf.int32), predictions)
    }
    # The same spec is returned for every non-PREDICT mode.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops
    )
The training and eval data is identical. I'm feeding in an array of padded text sequences (length 8), with an expected output of one of five labels.
The Losses
I'm assuming the problem lies in the loss function. I can't quite figure out what the Sequential model is doing behind the scenes versus what my tensorflow model is doing.
In the Keras model, I'm getting the following loss.
INFO:tensorflow:global_step/sec: 170.783
INFO:tensorflow:loss = 0.0018957269, step = 1701 (0.586 sec)
INFO:tensorflow:global_step/sec: 164.419
INFO:tensorflow:loss = 0.029586311, step = 1801 (0.608 sec)
INFO:tensorflow:global_step/sec: 155.381
INFO:tensorflow:loss = 0.0019212833, step = 1901 (0.644 sec)
INFO:tensorflow:Loss for final step: 0.0023477676.
In the Converted model, I'm getting the following.
INFO:tensorflow:loss = 1.232958, step = 1701 (0.354 sec)
INFO:tensorflow:global_step/sec: 280.328
INFO:tensorflow:loss = 1.0923336, step = 1801 (0.357 sec)
INFO:tensorflow:global_step/sec: 291.823
INFO:tensorflow:loss = 1.4360821, step = 1901 (0.343 sec)
INFO:tensorflow:Loss for final step: 1.0532712.
As expected the accuracy on the Converted model (for the data it was trained on) hits around 60%. The accuracy for the Keras model is at 100%.
My question here is: does everything look right in the conversion? What could I be doing differently with the converted model to get similar performance?
I've started to dig around in the Keras source code to see what the model compile function is doing with targets/outputs, but was going to reach out here as well to see if anyone has a suggestion/ran into this before.

The problem is probably that you're applying two softmax activations in the TensorFlow version. Note that tf.losses.softmax_cross_entropy expects unscaled logits. You could do the following:
# Keep the Dense output un-activated so the loss receives raw logits.
output_layer = tf.keras.layers.Dense(NUM_CLASSES)
logits = output_layer(flattened)
# Softmax is applied separately, only to produce the predictions.
predictions = tf.keras.layers.Activation('softmax')(logits)
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)

Related

Transformer didn't work well with tensorflow gradient tape

I implemented transformer with tensorflow 2.0. The model works well when I train the model with model.fit(dataset)
However, when I train the model with tensorflow.GradientTape and evaluate it, the model yields blank space token for all inputs. Here is my code, and tensorflow version is 2.7.0
def loss_function(y_true, y_pred):
    """Sparse categorical cross-entropy with positions of label 0 zeroed out.

    `y_true` is reshaped to (-1, MAX_LENGTH - 1) and `y_pred` is treated as
    logits. The per-position loss is multiplied by a 0/1 mask (0 where the
    label equals 0) and then averaged over every position, masked ones
    included.
    """
    targets = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
    criterion = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_position = criterion(targets, y_pred)
    keep = tf.cast(tf.not_equal(targets, 0), tf.float32)
    return tf.reduce_mean(tf.multiply(per_position, keep))
# Custom training loop: one forward/backward pass and one optimizer update
# per dataset element, repeated for num_epochs passes over the dataset.
for epoch in range(num_epochs):
    for step, data in enumerate(dataset):
        # Each element is indexed as (inputs dict, targets dict).
        enc_inputs, dec_inputs, outputs = data[0]['inputs'], data[0]['dec_inputs'], data[1]['outputs']
        # Record the forward pass so gradients of the loss can be taken.
        with tf.GradientTape() as tape:
            logits = model([enc_inputs, dec_inputs], training = True)
            loss = loss_function(outputs, logits)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
I think there is no problem with my transformer model code, because it works well with model.fit(dataset). What's wrong with my code?

Getting constant accuracies for training and validation sets despite their losses are changing during CNN training?

As the title describes, while training my CNN model the accuracies on the training and validation sets stay constant even though their losses are changing. I have included the details of the model and its training setup below. What may cause this issue?
Here is the data that was used by training (X_train & y_train), validation, and test sets (X_test and y_test):
# Load the CSV without a header row, so columns are positional.
df = pd.read_csv(CSV_PATH, sep=',', header=None)
print(f'Shape of all data: {df.shape}')
# The last column is taken as the label, every other column as a feature.
y = df.iloc[:, -1].values
X = df.iloc[:, :-1].values
# Encode the labels with LabelEncoder, then expand them with to_categorical.
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)
dummy_y = to_categorical(encoded_Y)
# Hold out 30% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.3, random_state=RANDOM_STATE)
# Add a trailing axis of size 1 so each sample has shape (n_columns, 1).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
Here are the shapes of training and test sets:
Shape of X_train: (1322, 10800, 1)
Shape of Y_train: (1322, 3)
Shape of X_test: (567, 10800, 1)
Shape of y_test: (567, 3)
Here is my CNN model:
# Model hyper-parameters
# NOTE(review): batch_size, num_epochs, n_momentum and n_reg are defined here
# but not referenced in the model code shown below.
activation_fn = 'relu'
n_lr = 1e-4
weight_decay = 1e-4
batch_size = 64
num_epochs = 200*10*10
num_classes = 3
n_dropout = 0.6
n_momentum = 0.5
n_kernel = 5
n_reg = 1e-5
# the sequential model
model = Sequential()
# Convolution block 1: Conv1D -> BatchNorm -> activation -> max-pool -> dropout.
model.add(Conv1D(128, n_kernel, input_shape=(10800, 1)))
model.add(BatchNormalization())
model.add(Activation(activation_fn))
model.add(MaxPooling1D(pool_size=2, strides=2))
model.add(Dropout(n_dropout))
# Convolution block 2: same layout with 256 filters.
model.add(Conv1D(256, n_kernel))
model.add(BatchNormalization())
model.add(Activation(activation_fn))
model.add(MaxPooling1D(pool_size=2, strides=2))
model.add(Dropout(n_dropout))
model.add(GlobalAveragePooling1D()) # have tried model.add(Flatten()) as well
# Classifier head: two dense layers, each followed by dropout, then softmax.
model.add(Dense(256, activation=activation_fn))
model.add(Dropout(n_dropout))
model.add(Dense(64, activation=activation_fn))
model.add(Dropout(n_dropout))
model.add(Dense(num_classes, activation='softmax'))
# NOTE(review): `weight_decay` is passed as Adam's `decay` argument --
# presumably a learning-rate decay rather than L2 weight decay; confirm
# against the Keras optimizer documentation.
adam = Adam(lr=n_lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=weight_decay)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
Here is how I have evaluated the model:
# Predict on the test set, then take the argmax along axis 1 of both the
# predictions and y_test to get integer class indices.
Y_pred = model.predict(X_test, verbose=0)
y_pred = np.argmax(Y_pred, axis=1)
y_test_int = np.argmax(y_test, axis=1)
And, my model always predicts the same class of three classes during the model evaluation as you can see from the classification result below (via classification_result(y_test_int, y_pred) function):
precision recall f1-score support
normal 0.743 1.000 0.852 421
apb 0.000 0.000 0.000 45
pvc 0.000 0.000 0.000 101
The model was trained using the EarlyStopping callback of Keras. Thus, the training has continued for 4,173 epochs. Here are the losses obtained during training for the training and validation sets:
Here are the obtained accuracies during the training for training and validation sets:
The model was implemented using Keras and hosted on Google Colab.
Although such issues are difficult to resolve without the data, there are a couple of general rules applicable.
The very first thing we do when the model does not seem to learn anything, like here (despite the mild drop in the loss), is to remove all dropout.
In fact, dropout is not supposed to be used by default; its nominal function is to guard against overfitting - but of course, before starting to worry about overfitting, you must first have some success with fitting, something that is clearly not happening here. The fact that, with a dropout rate of n_dropout = 0.6, you also seem to be rather too aggressive in its use, does not help, either.

tensorflow 2 evaluate inconsistent with sklearn accuracy_score

I try to train a model to predict gender using Celeba dataset and tensorflow.
This is my model:
# Training batches built from `train_split`: file ids come from the 'id'
# column, labels from the 'Male' column, and images are looked up under
# celeba.images_folder.
train_data_gen = train_image_generator.flow_from_dataframe(
    dataframe=train_split,
    directory=celeba.images_folder,
    x_col='id',
    y_col='Male',
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=batch_size,
    # Explicit class order: '1' is listed first, '0' second.
    classes=['1', '0']
)
# Transfer-learning classifier: an ImageNet-pretrained MobileNetV2 backbone
# (without its top) followed by global average pooling, a 512-unit ReLU layer
# with dropout, and a 2-unit output passed through Softmax.
base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
                                               include_top=False,
                                               weights='imagenet')
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(2),
    tf.keras.layers.Softmax()
])
base_learning_rate = 0.001
# The model ends in a Softmax layer, so its outputs are probabilities rather
# than logits. The loss must therefore use from_logits=False; with
# from_logits=True softmax would be applied a second time inside the loss.
model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=base_learning_rate),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
Then I use the following to evaluate the model
# Test batches built from `test_split`, with the same columns, target size
# and class order as the training generator.
# NOTE(review): `shuffle` is not passed, so the generator's default applies --
# presumably shuffle=True; confirm against the flow_from_dataframe docs.
test_data_gen = test_image_generator.flow_from_dataframe(
    dataframe=test_split,
    directory=celeba.images_folder,
    x_col='id',
    y_col='Male',
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=batch_size,
    classes=['1', '0']
)
# Restore the model saved at "cp-0004.ckpt".
model = tf.keras.models.load_model("cp-0004.ckpt")
# Re-evaluate the model
loss, acc = model.evaluate(test_data_gen, verbose=2)
which gives accuracy of 0.87
But when I use the following, I get 0.51 accuracy!
# Predictions come back in the order the generator yields the images.
pred_test = model.predict(test_data_gen)
pred_df = pd.DataFrame(pred_test, columns=["Male", "Female"])
# Overwrite cells above / below 0.5 with the strings "1" / "0".
pred_df[pred_df > 0.5] = "1"
pred_df[pred_df < 0.5] = "0"
# test_split_raw = celeba.split('test', drop_zero=False)
# NOTE(review): the true labels are read in dataframe order, so they only
# line up with the predictions if the generator does not shuffle -- confirm.
confusion_matrix(test_split["Male"].astype(int).values, np.argmax(pred_df.values, 1))
Can anyone explain why the accuracy from the evaluate function is different?
You want to check test_image_generator.flow_from_dataframe. The default value of shuffle is set to True.
Your generator object therefore yields randomly from your test data.
Your model then predicts for those randomly generated images, but you compare to your ordered dataframe. If you want to compare to test_split["Male"] set shuffle to False. If you don't set shuffle to False you will always get ~0.5 accuracy (If your data is equally distributed)
Another hint: You should use the .evaluate() method if you have labeled data. Using .evaluate() also yields accuracy.
Use .predict() only for new, unlabeled data.

Different results from Tensorflow and Keras

I get different results from Tensorflow and Keras with the same network structure.
The loss function looks like
class MaskedMultiCrossEntropy(object):
    """Softmax cross-entropy that zeroes out entries marked with -1."""

    def loss(self, y_true, y_pred):
        """Return the cross-entropy over axis 1, with masked entries set to 0.

        `y_pred` is treated as logits. An entry is masked wherever
        y_true[:, 0, :] equals -1. The result is not reduced.
        """
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=y_true, logits=y_pred, dim=1)
        ignored = tf.equal(y_true[:, 0, :], -1)
        return tf.where(ignored, x=tf.zeros_like(cross_entropy), y=cross_entropy)
The network layer I used is called CrowdsClassification, which is implemented by Keras. Then I build the network by
# Base classifier: Dense(128, relu) -> Dropout(0.5) -> Dense(N_CLASSES) -> softmax.
x = Dense(128, input_shape=(input_dim,), activation='relu')(inputs)
x = Dropout(0.5)(x)
x = Dense(N_CLASSES)(x)
x = Activation("softmax")(x)
# The softmax output is then passed through the CrowdsClassification layer.
# NOTE(review): this line uses `num_classes` while the Dense layer above uses
# `N_CLASSES` -- presumably the same value; confirm.
crowd = CrowdsClassification(num_classes, num_oracles, conn_type="MW")
x = crowd(x)
Train the model with Keras
# Train with Keras: wrap the graph in a Model, compile it with Adam and the
# custom loss, and fit for 100 epochs without shuffling, holding out 10% of
# the data for validation.
model = Model(inputs=inputs, outputs=x)
model.compile(optimizer='adam', loss=loss)
# The original call ended in an unbalanced extra ")", which is a syntax error.
model.fit(inputs,
          true_class, epochs=100, shuffle=False, verbose=2, validation_split=0.1)
Train the model with tensorflow
# Train with a raw TensorFlow session: Adam with an explicit learning rate of
# 0.01, minimizing `loss` and advancing `global_step` on each update.
optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
opt_op = optimizer.minimize(loss, global_step=global_step)
sess.run(tf.global_variables_initializer())
# One run of the loss and update ops per epoch, always on the same feed dict.
for epoch in range(100):
    sess.run([loss, opt_op], feed_dict=train_feed_dict)
TensorFlow gets a wrong prediction. It seems that the issue comes from the loss function: TensorFlow cannot backpropagate the masked loss. Can anyone give some advice? Thanks a lot.

Step is always 0 when using tf.estimator.Estimator

I have been trying to learn the layers and estimators framework that was recently moved from contrib to main API. I ran into a rather strange problem. I wrote a simple autoencoder for MNIST, but somehow, when I train it keeps saying I am at step 0 even though the loss value is decreasing, so I guess the model is getting trained. Of course, since it is not counting steps, it is not saving the checkpoints and it is not saving any summaries either. Not sure what I am doing wrong and all the docs point to the old "tf.contrib.learn" framework and a lot of APIs there seem to be marked as deprecated. How do I make this work? Here is my code:
def encoder(x):
    """Pass `x` through two ReLU dense layers of 256 and then 128 units."""
    hidden = tf.layers.dense(x, 256, activation=tf.nn.relu, name='encode1')
    return tf.layers.dense(hidden, 128, activation=tf.nn.relu, name='encode2')
def decoder(x):
    """Pass `x` through two ReLU dense layers of 256 and then 784 units."""
    hidden = tf.layers.dense(x, 256, activation=tf.nn.relu, name='decode1')
    return tf.layers.dense(hidden, 784, activation=tf.nn.relu, name='decode2')
def loss(labels, preds):
    """Return the Huber loss between `labels` and `preds`."""
    return tf.losses.huber_loss(labels=labels, predictions=preds)
def train(loss):
    """Return an op that minimizes `loss` with a default-configured Adam.

    No global_step is passed to minimize().
    """
    return tf.train.AdamOptimizer().minimize(loss)
def model_fn(features, labels, mode):
    """Estimator model_fn for the autoencoder.

    Encodes then decodes `features`, scores the reconstruction against
    `labels`, and returns an EstimatorSpec carrying the reconstruction as
    predictions together with the loss and the training op.
    """
    reconstruction = decoder(encoder(features))
    objective = loss(labels, reconstruction)
    update_op = train(objective)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=reconstruction,
        loss=objective,
        train_op=update_op,
    )
# Load the data sets from the current directory with one-hot labels.
data = input_data.read_data_sets(".", one_hot=True)
display.clear_output()
# remove current log dir
shutil.rmtree('logs', ignore_errors=True)
def input_fn():
    """Return one batch of 100 training images as both features and labels.

    The batch's own labels are discarded, so the input is also the target.
    Raises StopIteration once more than 10 epochs have been completed.
    """
    if data.train.epochs_completed > 10:
        raise StopIteration
    batch, _ = data.train.next_batch(100)
    return tf.constant(batch), tf.constant(batch)
# Build the estimator with "logs" as its model directory and start training.
estimator = tf.estimator.Estimator(model_fn, "logs")
estimator.train(input_fn=input_fn)
And here is some sample output
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'logs', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 0 into logs/model.ckpt.
INFO:tensorflow:loss = 0.0505481, step = 0
INFO:tensorflow:loss = 0.00319921, step = 0 (1.125 sec)
INFO:tensorflow:loss = 0.00277268, step = 0 (1.094 sec)
INFO:tensorflow:loss = 0.00275822, step = 0 (1.106 sec)
INFO:tensorflow:loss = 0.00275116, step = 0 (1.069 sec)
INFO:tensorflow:loss = 0.00275018, step = 0 (1.130 sec)
INFO:tensorflow:loss = 0.00274921, step = 0 (1.161 sec)
INFO:tensorflow:loss = 0.00274908, step = 0 (1.140 sec)
INFO:tensorflow:loss = 0.00274683, step = 0 (1.105 sec)
INFO:tensorflow:loss = 0.00274397, step = 0 (1.111 sec)
In the training op you need to set the global_step parameter, which is the step counter that gets incremented for each model training run. So change it to:
# Passing global_step makes minimize() increment the step counter on each training run.
optimizer.minimize(loss, global_step=tf.train.get_global_step())