BERT Model Fine Tuning and migrating to TF2 - tensorflow

I executed this excellent tutorial:
https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d
I understood most of it except where model is being created. I would like to know it and migrate to TF2 bert.
When he says "Basically we load the pre-trained model and then train the last layer for classification task.", does it mean that he is freezing all the other layers and fine-tuning the last layer? This is the relevant code (in TF1) which I am not able to understand:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
labels, num_labels, use_one_hot_embeddings):
"""Creates a classification model."""
model = modeling.BertModel(
config=bert_config,
is_training=is_training,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=segment_ids,
use_one_hot_embeddings=use_one_hot_embeddings)
output_layer = model.get_pooled_output()
hidden_size = output_layer.shape[-1].value
output_weights = tf.get_variable(
"output_weights", [num_labels, hidden_size],
initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.get_variable(
"output_bias", [num_labels], initializer=tf.zeros_initializer())
with tf.variable_scope("loss"):
if is_training:
# I.e., 0.1 dropout
output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
# probabilities = tf.nn.softmax(logits, axis=-1) ### multiclass case
probabilities = tf.nn.sigmoid(logits)#### multi-label case
labels = tf.cast(labels, tf.float32)
tf.logging.info("num_labels:{};logits:{};labels:{}".format(num_labels, logits, labels))
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
loss = tf.reduce_mean(per_example_loss)
return (loss, per_example_loss, logits, probabilities)
I went through the TF2 fine tuning tutorials for BERT, but how do I achieve the same? I am able to train other models where step 1 is not required.

Use the official bert example :
https://www.tensorflow.org/tutorials/text/classify_text_with_bert

Related

Transformer didn't work well with tensorflow gradient tape

I implemented transformer with tensorflow 2.0. The model works well when I train the model with model.fit(dataset)
However, when I train the model with tensorflow.GradientTape and evaluate it, the model yields blank space token for all inputs. Here is my code, and tensorflow version is 2.7.0
def loss_function(y_true, y_pred):
y_true = tf.reshape(y_true, shape=(-1, MAX_LENGTH - 1))
loss = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction='none')(y_true, y_pred)
mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
loss = tf.multiply(loss, mask)
return tf.reduce_mean(loss)
for epoch in range(num_epochs):
for step, data in enumerate(dataset):
enc_inputs, dec_inputs, outputs = data[0]['inputs'], data[0]['dec_inputs'], data[1]['outputs']
with tf.GradientTape() as tape:
logits = model([enc_inputs, dec_inputs], training = True)
loss = loss_function(outputs, logits)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
I think there is no problem with my transformer model code, because it works well with model.fit(dataset). What's wrong with my code?

Where do the input_ids, input_mask, and segment_ids variables come from in a BERT model?

I'm trying to work through the Google BERT tutorial in Google Colab, and am having a hard time following some of its steps.
Specifically, there is a function called create_model which reads like this:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
num_labels):
"""Creates a classification model."""
bert_module = hub.Module(
BERT_MODEL_HUB,
trainable=True)
bert_inputs = dict(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids)
bert_outputs = bert_module(
inputs=bert_inputs,
signature="tokens",
as_dict=True)
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
output_layer = bert_outputs["pooled_output"]
hidden_size = output_layer.shape[-1].value
# Create our own layer to tune for politeness data.
output_weights = tf.get_variable(
"output_weights", [num_labels, hidden_size],
initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.get_variable(
"output_bias", [num_labels], initializer=tf.zeros_initializer())
with tf.variable_scope("loss"):
# Dropout helps prevent overfitting
output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
# Convert labels into one-hot encoding
one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
# If we're predicting, we want predicted labels and the probabiltiies.
if is_predicting:
return (predicted_labels, log_probs)
# If we're train/eval, compute loss between predicted and actual label
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
return (loss, predicted_labels, log_probs)
This would be fine, but when I look for where the create_model function is called in the notebook, the origin of the input_ids, input_mask and segment_ids arguments is not clear to me.
This is where the function is referenced later on in the notebook:
def model_fn_builder(num_labels, learning_rate, num_train_steps,
num_warmup_steps):
"""Returns `model_fn` closure for TPUEstimator."""
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
"""The `model_fn` for TPUEstimator."""
input_ids = features["input_ids"]
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
label_ids = features["label_ids"]
is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
# TRAIN and EVAL
if not is_predicting:
(loss, predicted_labels, log_probs) = create_model(
is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
The problem here is that the features argument is not listed as an argument in the parent function, and it's not defined anywhere else in the notebook. So I'm not sure why it works, and even more importantly, I'm not sure what things like features['input_ids'] are supposed to represent. Without this being clear to me, it'll be difficult to make sense of what BERT actually does.
Thank you for your help.

Different results from Tensorflow and Keras

I get different results from Tensorflow and Keras with the same network structure.
The loss function looks like
class MaskedMultiCrossEntropy(object):
def loss(self, y_true, y_pred):
vec = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true, dim=1)
mask = tf.equal(y_true[:,0,:], -1)
zer = tf.zeros_like(vec)
loss = tf.where(mask, x=zer, y=vec)
return loss
The network layer I used is called CrowdsClassification, which is implemented by Keras. Then I build the network by
x = Dense(128, input_shape=(input_dim,), activation='relu')(inputs)
x = Dropout(0.5)(x)
x = Dense(N_CLASSES)(x)
x = Activation("softmax")(x)
crowd = CrowdsClassification(num_classes, num_oracles, conn_type="MW")
x = crowd(x)
Train the model with Keras
model = Model(inputs=inputs, outputs=x)
model.compile(optimizer='adam', loss=loss)
model.fit(inputs,
true_class, epochs=100, shuffle=False, verbose=2, validation_split=0.1))
Train the model with tensorflow
optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
opt_op = optimizer.minimize(loss, global_step=global_step)
sess.run(tf.global_variables_initializer())
for epoch in range(100):
sess.run([loss, opt_op], feed_dict=train_feed_dict)
The Tensorflow will get a wrong prediction. It seems that the issue comes from the loss function, that Tensorflow cannot backproporgate the masked loss. Anyone can give some advices? Thx a lot.

Similar Keras code for a tensorflow model

The following code is a tensorflow code to create a model. It would be really helpful for me to have the exact alternative keras implementation of this code:
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
probabilities = tf.nn.softmax(logits, axis=-1)
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
Specially i'm confused about loss and optimizer, what kind of loss is used here. Can the following code be an alternative in keras:
sequence_input = Input(shape=(768,), dtype='float32')
preds = Dense(2, activation='softmax')(sequence_input)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])

RNN for Speech Emotion Recognition

I want to classify speech data into four different emotions (angry, sad, happy, neutral).
The problem is that when I run RNN code, all speech data classified into one class.
(For example, all speech data classified as "angry" all the time.)
I don't know what is the reason for this problem and what I have to change for training.
Here's my tensorflow RNN main function for training and calculating accuracy:
def RNN(x, weights, biases, lstm_size):
lstm_cell = []
for i in range(lstm_size):
lstm_cell.append(rnn.BasicLSTMCell(hidden_dim, forget_bias=1.0, state_is_tuple=True, activation=tf.nn.sigmoid))
stacked_lstm = tf.contrib.rnn.MultiRNNCell(lstm_cell, state_is_tuple=True)
outputs, states = tf.nn.dynamic_rnn(stacked_lstm, x, dtype=tf.float32)
foutput = tf.contrib.layers.fully_connected(outputs[:,-1], output_dim, activation_fn = None)
return foutput
logits = RNN(X, weights, biases, lstm_size)
prediction = tf.nn.sigmoid(logits)
cost =tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=Y))
learning_rate =tf.train.exponential_decay(learning_rate=initial_learning_rate, global_step=training_steps, decay_steps=training_steps/10, decay_rate=0.96, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)
pred = tf.argmax(prediction, axis=1)
label = tf.argmax(Y, axis=1)
correct_pred = tf.equal(pred, label)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float))
Input for RNN is speech features(pitch and MFCC) and output for RNN is one-hot code.(For example, angry=[1,0,0,0]).
Also, I wonder whether it is right or not to calculate classification accuracy like this.