I implemented a transformer with TensorFlow 2.0. The model works well when I train it with model.fit(dataset).
However, when I train the model with tensorflow.GradientTape and evaluate it, the model yields the blank-space token for all inputs. Here is my code; my TensorFlow version is 2.7.0.
def loss_function(y_true, y_pred):
    """Return the masked sparse categorical cross-entropy of a batch.

    Args:
        y_true: Integer labels; reshaped to ``(-1, MAX_LENGTH - 1)``.
        y_pred: Unnormalised logits (the loss is built with
            ``from_logits=True``).

    Returns:
        A scalar tensor: the mean of the per-position losses after positions
        whose label equals 0 have been zeroed out.
    """
    labels = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))

    # reduction='none' keeps one loss value per position so it can be masked.
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_position_loss = cross_entropy(labels, y_pred)

    # Label id 0 is excluded from the loss (presumably the padding id).
    keep_mask = tf.cast(tf.not_equal(labels, 0), tf.float32)
    masked_loss = tf.multiply(per_position_loss, keep_mask)

    # The mean is taken over every position, including the zeroed ones.
    return tf.reduce_mean(masked_loss)
# Custom training loop: one optimizer step per batch, repeated for num_epochs.
for epoch in range(num_epochs):
    for step, data in enumerate(dataset):
        # Each element is indexed as (feature dict, label dict).
        features, labels = data[0], data[1]
        enc_inputs = features['inputs']
        dec_inputs = features['dec_inputs']
        outputs = labels['outputs']

        # Record the forward pass so the loss can be differentiated.
        with tf.GradientTape() as tape:
            logits = model([enc_inputs, dec_inputs], training=True)
            loss = loss_function(outputs, logits)

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
I think there is no problem with my transformer model code, because it works well with model.fit(dataset). What's wrong with my code?
Related
Consider the following toy model:
class MyModel(keras.Model):
    """Toy model with two parallel ``Dense(2)`` heads and its own optimizer."""

    def __init__(self, **kwargs):
        super(MyModel, self).__init__(**kwargs)
        self.square_layer = keras.layers.Dense(2)
        self.cube_layer = keras.layers.Dense(2)
        self.optimizer = tf.keras.optimizers.Adam()

    @tf.function
    def call(self, X):
        """Apply both heads to ``X`` and stack their outputs on a new last axis."""
        return tf.stack([self.square_layer(X), self.cube_layer(X)], axis=-1)

    @tf.function
    def train_step(self, inputs, targets):
        """Run one optimizer step on the mean squared error; return the loss."""
        with tf.GradientTape() as tape:
            predictions = self(inputs)
            loss = tf.reduce_mean(tf.square(predictions - targets))
        # The weight list is read here, i.e. while the function is traced.
        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return loss
If we train using the following 'train' function, and set 'self.cube_layer.trainable' as True or False, the result is as expected in both the cases:
def train(self, inputs, targets, num_epochs=5000):
    """Train for ``num_epochs`` steps with ``cube_layer`` set as below.

    Written as a method of MyModel (it takes ``self``); prints the loss
    returned by the last training step.
    """
    self.cube_layer.trainable = False # True or False
    self.compile(optimizer=self.optimizer)
    for _ in range(num_epochs):
        loss = self.train_step(inputs, targets)
    print("Loss: " + str(loss))


# Driver: a single sample with a (1, 2, 2) target.
inputs = tf.constant([[1,2]], dtype=tf.float32)
targets = tf.constant([[[3,6], [9,12]]], dtype=tf.float32)

model = MyModel()
model.train(inputs, targets)
print(model(inputs))
But, if we change the 'trainable' flag during training, the result is not as expected:
def train(self, inputs, targets, num_epochs=5000):
    """Train in two phases, flipping ``cube_layer.trainable`` in between.

    Written as a method of MyModel (it takes ``self``). Phase 1 runs with
    ``cube_layer`` frozen, phase 2 with it trainable; each phase runs
    ``num_epochs`` steps. Prints the loss returned by the last step.
    """
    # Phase 1: cube_layer frozen.
    self.cube_layer.trainable = False
    self.compile(optimizer=self.optimizer)
    for _ in range(num_epochs):
        loss = self.train_step(inputs, targets)

    # Phase 2: cube_layer trainable again.
    self.cube_layer.trainable = True
    self.compile(optimizer=self.optimizer)
    for _ in range(num_epochs):
        loss = self.train_step(inputs, targets)

    print("Loss: " + str(loss))


# Driver: a single sample with a (1, 2, 2) target.
inputs = tf.constant([[1,2]], dtype=tf.float32)
targets = tf.constant([[[3,6], [9,12]]], dtype=tf.float32)

model = MyModel()
model.train(inputs, targets)
print(model(inputs))
In the above example, if we remove the '@tf.function' decorators from 'call' and 'train_step', the result is as expected! So, I believe it has something to do with tf.function and TensorFlow graph compilation.
Is there a way we can use tf.function and set the 'trainable' attribute dynamically during training? I am using TensorFlow 2.9.1.
This is a very interesting and significant problem. Let's locate the problem by adding 3 print lines and running a small test with 5 epochs, based on the last train function in your question, i.e.:
...
@tf.function
def train_step(self, inputs, targets):
    """Run one optimizer step on the mean squared error; return the loss.

    Also prints how many trainable variables this function sees.
    """
    with tf.GradientTape() as tape:
        predictions = self(inputs)
        loss = tf.reduce_mean(tf.square(predictions - targets))
    grads = tape.gradient(loss, self.trainable_variables)
    tf.print(len(self.trainable_variables),"in graph") # add
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    return loss
...
def train(self, inputs, targets, num_epochs=5):
    """Two-phase training that prints the trainable-variable count per phase.

    Written as a method of MyModel (it takes ``self``). The count is printed
    after each change of ``cube_layer.trainable``, before that phase's steps.
    """
    # Phase 1: cube_layer frozen.
    self.cube_layer.trainable = False
    print(len(self.trainable_variables),"before frozen") # add
    self.compile(optimizer=self.optimizer)
    for _ in range(num_epochs):
        loss = self.train_step(inputs, targets)

    # Phase 2: cube_layer trainable again.
    self.cube_layer.trainable = True
    print(len(self.trainable_variables),"after frozen") # add
    self.compile(optimizer=self.optimizer)
    for _ in range(num_epochs):
        loss = self.train_step(inputs, targets)
output is:
0 before frozen
2 in graph
2 in graph
2 in graph
2 in graph
2 in graph
4 after frozen
2 in graph
2 in graph
2 in graph
2 in graph
2 in graph
Wow — even though you changed cube_layer's flag, and that did change model.trainable_variables, it did not affect train_step.
That is because, in this code, train_step has already been converted into a graph and is not converted again. This does not mean that a function, once converted into a computation graph, always remains unchanged.
😊 The deeper reason is tf.function's tracing mechanism. If you repeatedly call a graphed function with the same argument types, TensorFlow skips the tracing stage and reuses a previously traced graph, since the generated graph would be identical. Here the inputs of train_step did not change, so no new graph is traced, and the modification of self.cube_layer.trainable has no effect.
So, let's fix it. In fact, it's not a bug, because we'd better not mix high-level (compile, fit) and medium-level (tf.GradientTape) APIs. model.compile only works for model.fit and does nothing here.
So, a better way to write this is:
class MyModel(tf.keras.Model):
    """Toy model with one separately decorated training step per phase.

    ``train_step1`` and ``train_step2`` have identical bodies on purpose:
    each is its own ``tf.function`` and is used in a different phase of
    ``train`` (frozen vs. trainable ``cube_layer``).
    """

    def __init__(self, **kwargs):
        super(MyModel, self).__init__(**kwargs)
        self.square_layer = tf.keras.layers.Dense(2)
        self.cube_layer = tf.keras.layers.Dense(2)
        self.optimizer = tf.keras.optimizers.Adam()

    @tf.function
    def call(self, X):
        """Apply both heads to ``X`` and stack their outputs on a new last axis."""
        return tf.stack([self.square_layer(X), self.cube_layer(X)], axis=-1)

    @tf.function
    def train_step1(self, inputs, targets):
        """Training step used while ``cube_layer`` is frozen; returns the loss."""
        with tf.GradientTape() as tape:
            predictions = self(inputs)
            loss = tf.reduce_mean(tf.square(predictions - targets))
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return loss

    @tf.function
    def train_step2(self, inputs, targets):
        """Training step used once ``cube_layer`` is trainable; returns the loss."""
        with tf.GradientTape() as tape:
            predictions = self(inputs)
            loss = tf.reduce_mean(tf.square(predictions - targets))
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return loss

    def train(self, inputs, targets, num_epochs=5000):
        """Train in two phases of ``num_epochs`` steps and print the last loss."""
        # Phase 1: cube_layer frozen, stepped with train_step1.
        self.cube_layer.trainable = False
        self.train_step = self.train_step1
        for epoch in range(num_epochs):
            loss = self.train_step(inputs, targets)

        # Phase 2: cube_layer trainable, stepped with train_step2.
        self.cube_layer.trainable = True
        self.train_step = self.train_step2
        for epoch in range(num_epochs):
            loss = self.train_step(inputs, targets)

        print("Loss: " + str(loss))


# Driver: a single sample with a (1, 2, 2) target.
inputs = tf.constant([[1,2]], dtype=tf.float32)
targets = tf.constant([[[3,6], [9,12]]], dtype=tf.float32)
model = MyModel()
model.train(inputs, targets)
print(model(inputs))
And everything works as expected:
Loss: tf.Tensor(1.351493e-06, shape=(), dtype=float32)
tf.Tensor(
[[[ 3. 5.9999933]
[ 8.999994 11.997685 ]]], shape=(1, 2, 2), dtype=float32)
I am adding a custom loss to a VAE, as suggested here: https://www.linkedin.com/pulse/supervised-variational-autoencoder-code-included-ibrahim-sobh-phd/
Instead of defining a loss function, it uses a dense network and takes its output as the loss (if I understand correctly).
# New: add a classifier
# A single softmax Dense(10) layer that maps a latent vector to class scores.
clf_latent_inputs = Input(shape=(latent_dim,), name='z_sampling_clf')
clf_outputs = Dense(10, activation='softmax', name='class_output')(clf_latent_inputs)
clf_supervised = Model(clf_latent_inputs, clf_outputs, name='clf')
clf_supervised.summary()

# instantiate VAE model
# New: Add another output
# Element [2] of the encoder output feeds both the decoder and the classifier.
outputs = [decoder(encoder(inputs)[2]), clf_supervised(encoder(inputs)[2])]
vae = Model(inputs, outputs, name='vae_mlp')
vae.summary()

# Reconstruction + KL terms, attached to the model through add_loss.
reconstruction_loss = binary_crossentropy(inputs, outputs[0])
reconstruction_loss *= original_dim
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean((reconstruction_loss + kl_loss) /100.0)
vae.add_loss(vae_loss)

# New: add the clf loss
vae.compile(optimizer='adam', loss={'clf': 'categorical_crossentropy'})  # ===> this line <===
vae.summary()
# reconstruction_loss = binary_crossentropy(inputs, outputs)
svae_history = vae.fit(x_train, {'clf': y_train},
                       epochs=epochs,
                       batch_size=batch_size)
I am stuck at the compilation step (annotated as ===> this line <===), where I get a type error:
TypeError: Expected float32, got <function
BaseProtVAE.__init__.<locals>.vae_loss at 0x7ff53051dd08> of type
'function' instead.
I need your help if you've got any suggestions.
There are several ways to implement VAE in Tensorflow. I propose an alternative implementation that can be found in custom_layers_and_models in Tensorflow guide pages :
Let's put all of these things together into an end-to-end example: we're going to implement a Variational AutoEncoder (VAE). We'll train it on MNIST digits.
It uses custom Model classes and the gradient tape. In this way, it is quite easy to add the classifier into the VAE model and add the categorical cross-entropy to the total loss during the optimization.
All you need is to modify:
class VariationalAutoEncoder(Model):
    """Combines the encoder and decoder into an end-to-end model for training.

    A softmax ``Dense(10)`` classifier head is applied to the sampled latent
    vector, so ``call`` returns both a reconstruction and class predictions.
    """

    def __init__(
        self,
        original_dim,
        intermediate_dim=64,
        latent_dim=32,
        name="autoencoder",
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.original_dim = original_dim
        self.encoder = Encoder(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
        self.decoder = Decoder(original_dim, intermediate_dim=intermediate_dim)
        self.clf_supervised = Dense(10, activation='softmax', name='class_output')

    def call(self, inputs):
        """Return ``(reconstructed, y_pred)`` and register the KL loss."""
        z_mean, z_log_var, z = self.encoder(inputs)

        # Add KL divergence regularization loss.
        kl_terms = z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
        kl_loss = -0.5 * tf.reduce_mean(kl_terms)
        self.add_loss(kl_loss)

        reconstructed = self.decoder(z)
        # classifier
        y_pred = self.clf_supervised(z)
        return reconstructed, y_pred
by adding the lines self.clf_supervised = Dense(10, activation='softmax', name='class_output') and y_pred = self.clf_supervised(z).
The optimization is done this way:
vae = VariationalAutoEncoder(original_dim, intermediate_dim, latent_dim)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
mse_loss_fn = tf.keras.losses.MeanSquaredError()
# Built once here instead of once per batch; the loss object holds no
# per-batch state.
clf_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

loss_metric = tf.keras.metrics.Mean()

epochs = 2

train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=500).batch(4)

# Iterate over epochs.
for epoch in range(epochs):
    print("Start of epoch %d" % (epoch,))

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            reconstructed, y_pred = vae(x_batch_train)
            clf_loss = clf_loss_fn(y_batch_train, y_pred)
            # Compute reconstruction loss
            loss = mse_loss_fn(x_batch_train, reconstructed)
            loss += sum(vae.losses)  # Add KLD regularization loss
            loss += clf_loss

        grads = tape.gradient(loss, vae.trainable_weights)
        optimizer.apply_gradients(zip(grads, vae.trainable_weights))

        # Running mean of the total loss, reported every 100 steps.
        loss_metric(loss)

        if step % 100 == 0:
            print("step %d: mean loss = %.4f" % (step, loss_metric.result()))
The rest of the code is in the link above. The main change is the optimization done with tf.GradientTape(). It's a bit more complicated than the fit method but it's still quite simple and very powerful.
I executed this excellent tutorial:
https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d
I understood most of it except where model is being created. I would like to know it and migrate to TF2 bert.
When he says "Basically we load the pre-trained model and then train the last layer for classification task.", does it mean that he is freezing all the other layers and fine-tuning the last layer? This is the relevant code (in TF1) which I am not able to understand:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model.

    Builds a BertModel, takes its pooled output, and adds a linear layer
    (``output_weights`` / ``output_bias``) with sigmoid cross-entropy on top.

    Returns:
        A tuple ``(loss, per_example_loss, logits, probabilities)``.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value

    # Parameters of the classification layer added on top of the pooled output.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.nn.bias_add(
            tf.matmul(output_layer, output_weights, transpose_b=True),
            output_bias)

        # probabilities = tf.nn.softmax(logits, axis=-1) ### multiclass case
        probabilities = tf.nn.sigmoid(logits)#### multi-label case

        labels = tf.cast(labels, tf.float32)
        tf.logging.info("num_labels:{};logits:{};labels:{}".format(num_labels, logits, labels))
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

        return loss, per_example_loss, logits, probabilities
I went through the TF2 fine tuning tutorials for BERT, but how do I achieve the same? I am able to train other models where step 1 is not required.
Use the official bert example :
https://www.tensorflow.org/tutorials/text/classify_text_with_bert
I have created custom loss (Weighted Absolute error) in keras but implementation doesn't work - I get an error ValueError: No gradients provided for any variable: ['my_model/conv2d/kernel:0', 'my_model/conv2d/bias:0'].
I want to apply different weight for each pixel.
class WeightedMeanAbsoluteError(tf.keras.metrics.Metric):
    """Metric that accumulates weighted absolute errors in a single variable.

    The state ``wmae`` is a running *sum* of ``|y_true - y_pred| * loss_weights``;
    it is never divided by a count.
    """

    def __init__(self, name='weighted_mean_absolute_error'):
        super().__init__(name=name)
        self.wmae = self.add_weight(name='wmae', initializer='zeros')

    def update_state(self, y_true, y_pred, loss_weights):
        """Add the summed weighted absolute error of this batch to the state."""
        weighted_errors = tf.math.abs(y_true - y_pred) * loss_weights
        batch_total = tf.reduce_sum(weighted_errors)
        return self.wmae.assign_add(batch_total)

    def result(self):
        """Return the accumulated state variable."""
        return self.wmae

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.wmae.assign(0.)


# The same metric class is instantiated for both roles.
loss_object = WeightedMeanAbsoluteError()
train_loss = WeightedMeanAbsoluteError()
I use the following code to implement a training step:
@tf.function
def train_step(input_images, output_images):
    """Run one optimizer step for a batch of (input, target) images."""
    with tf.GradientTape() as tape:
        # training=True is only needed if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        result_images = model(input_images, training=True)
        loss = loss_object(output_images, result_images)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
Also my code works just fine if I use
loss_object = tf.keras.losses.MeanAbsoluteError()
train_loss = tf.keras.metrics.MeanAbsoluteError()
The best and simplest way to minimize a weighted standard loss (such as MAE) is to use the sample_weight parameter of the fit method, to which we pass an array with the desired weight of each sample:
# Synthetic data: 1000 samples, 50 features, one integer weight per sample.
X = np.random.uniform(0,1, (1000,50))
y = np.random.uniform(0,1, 1000)
W = np.random.randint(1,10, 1000)

# The shape is written as an explicit one-element tuple; `(50)` is just the int 50.
inp = Input((50,))
x = Dense(64, activation='relu')(inp)
out = Dense(10)(x)
model = Model(inp, out)
model.compile('adam','mae')

# Keras' fit() takes per-sample weights through `sample_weight` (singular).
model.fit(X,y, epochs=100, sample_weight=W)
I get different results from Tensorflow and Keras with the same network structure.
The loss function looks like
class MaskedMultiCrossEntropy(object):
    """Softmax cross-entropy that is zeroed wherever the label is -1."""

    def loss(self, y_true, y_pred):
        """Return the cross-entropy along dim 1, zeroed at masked positions.

        A position is masked when ``y_true[:, 0, :]`` equals -1.
        """
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=y_pred, labels=y_true, dim=1)
        is_masked = tf.equal(y_true[:,0,:], -1)
        zeros = tf.zeros_like(cross_entropy)
        # Take 0 where masked, the cross-entropy value elsewhere.
        return tf.where(is_masked, x=zeros, y=cross_entropy)
The network layer I used is called CrowdsClassification, which is implemented by Keras. Then I build the network by
# Network: Dense(128, relu) -> Dropout(0.5) -> Dense(N_CLASSES) -> softmax,
# followed by the CrowdsClassification layer.
x = Dense(128, input_shape=(input_dim,), activation='relu')(inputs)
x = Dropout(0.5)(x)
x = Dense(N_CLASSES)(x)
x = Activation("softmax")(x)
# The crowd layer receives the softmax output.
crowd = CrowdsClassification(num_classes, num_oracles, conn_type="MW")
x = crowd(x)
Train the model with Keras
# Keras training path: compile with the custom loss and fit for 100 epochs.
model = Model(inputs=inputs, outputs=x)
model.compile(optimizer='adam', loss=loss)
# The original call ended with an unbalanced extra ")", which is a syntax error.
model.fit(inputs,
          true_class, epochs=100, shuffle=False, verbose=2, validation_split=0.1)
Train the model with tensorflow
# TensorFlow (TF1 session) training path: Adam on the same loss, 100 steps.
optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
opt_op = optimizer.minimize(loss, global_step=global_step)

sess.run(tf.global_variables_initializer())

# Each step evaluates the loss and applies one update on the same feed dict.
for _ in range(100):
    sess.run([loss, opt_op], feed_dict=train_feed_dict)
The TensorFlow version gets wrong predictions. It seems that the issue comes from the loss function: TensorFlow cannot backpropagate the masked loss. Can anyone give some advice? Thanks a lot.