BERT with bidirectional LSTM: loss is not decreasing - tensorflow

I am trying to port the Keras Semantic Similarity example to PyTorch Lightning. I followed all the necessary steps from the Keras example, but the loss is not decreasing at all. Where did I make a mistake?
Here is the colab link
PyTorch Model
MAX_LENGTH = 128

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import BertModel
import pytorch_lightning as pl

class BertWithLSTM(pl.LightningModule):
    def __init__(self, train_path, test_path, val_path, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased", output_attentions=True)
        self.bert.trainable = False
        self.lstm = nn.LSTM(input_size=768, hidden_size=64, batch_first=True, bidirectional=True, dropout=0.3)
        self.dropout = nn.Dropout(p=0.3)
        self.output = nn.Linear(MAX_LENGTH * 2, 3)
        self.num_classes = num_classes
        self.train_path = train_path
        self.test_path = test_path
        self.val_path = val_path
        self.criterion = nn.CrossEntropyLoss()

    # CustomDataset is defined elsewhere in the Colab notebook.
    def train_dataloader(self):
        train_dataset = CustomDataset(csv_path=self.train_path)
        return DataLoader(dataset=train_dataset, batch_size=32, shuffle=False)

    def test_dataloader(self):
        test_dataset = CustomDataset(csv_path=self.test_path)
        return DataLoader(dataset=test_dataset)

    def val_dataloader(self):
        val_dataset = CustomDataset(csv_path=self.val_path)
        return DataLoader(dataset=val_dataset, batch_size=16, shuffle=True)

    def training_step(self, train_batch, batch_idx):
        input_ids, attention_mask, token_type_ids, labels = train_batch
        output = self(input_ids, attention_mask, token_type_ids, labels)
        loss = F.cross_entropy(output, labels)
        self.log("training_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        input_ids, attention_mask, token_type_ids, labels = val_batch
        output = self(input_ids, attention_mask, token_type_ids, labels)
        loss = F.cross_entropy(output, labels)
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)

    def forward(self, input_ids, attention_mask, token_type_ids, label):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = bert_output.last_hidden_state
        output, (hidden_state, c_n) = self.lstm(sequence_output)
        avg_pool = torch.mean(output, dim=1)
        max_pool, indices = torch.max(output, dim=1)
        output = torch.cat((avg_pool, max_pool), 1)
        output = self.output(output)
        return output

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters())
        return optimizer
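One difference from the Keras version worth checking (a hedged observation, not a confirmed diagnosis): self.bert.trainable = False is a Keras attribute and has no effect on a PyTorch nn.Module, so BERT is still being fine-tuned here. If the intent is to freeze BERT as in the Keras example, the usual PyTorch equivalent, placed in __init__ right after loading the model, is a minimal sketch like this:

# Sketch: freeze the BERT encoder the PyTorch way (assumption: the goal is to
# mirror Keras' bert.trainable = False from the original example).
for param in self.bert.parameters():
    param.requires_grad = False

With the encoder frozen, only the LSTM and the linear head receive gradient updates, which matches the Keras setup.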

Related

HuggingFace PyTorch Trainer giving worse results than TensorFlow

I'm trying to make the switch from TensorFlow to PyTorch, but I'm getting noticeably worse results when running a model in PyTorch using Trainer.
I'm using bert-base-uncased, and as far as I can tell I'm using essentially the same settings in both (batch size, epochs, learning rate, etc.). However, I get an F1 score of 0.9967 from TensorFlow and 0.944649446494465 from PyTorch. The loss also fluctuates a lot more in PyTorch. I'm still pretty new to machine learning and Python in general, so I feel like it has to be something obvious, but I've yet to find it. Here are my scripts. Thanks in advance.
TensorFlow
import numpy as np
import sklearn.utils
import tensorflow as tf
import evaluate
from datasets import load_dataset
from transformers import (AutoTokenizer, TFAutoModelForSequenceClassification,
                          DataCollatorWithPadding, create_optimizer)
from transformers.keras_callbacks import KerasMetricCallback

SEQ_LEN = 256
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def train():
    def preprocess_function(examples):
        return tokenizer(examples["text"], max_length=SEQ_LEN, truncation=True, padding='max_length',
                         add_special_tokens=True, return_attention_mask=True,
                         return_token_type_ids=False, return_tensors='tf')

    dataset = load_dataset('json', data_files={"train": "full-items.json", "test": "validation-2.json"})
    tokenized = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    batch_size = 8
    num_epochs = 4
    batches_per_epoch = len(tokenized["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=4e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    tf_train_set = model.prepare_tf_dataset(
        tokenized["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    tf_validation_set = model.prepare_tf_dataset(
        tokenized["test"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    eval_metrics = evaluate.load("f1")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return eval_metrics.compute(predictions=predictions, references=labels)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    METRICS = [
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.SparseCategoricalCrossentropy(from_logits=True, name='sparse_crossentropy'),
    ]
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_train_set)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

    class_weights = dict(enumerate(sklearn.utils.class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(tokenized["train"]["label"]),
        y=tokenized["train"]["label"])))

    model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs,
              class_weight=class_weights, callbacks=[early_stop, metric_callback])
    model.save_pretrained('lease_to_own_model', save_format="tf")
PyTorch
import numpy as np
import sklearn.utils
import torch
import evaluate
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, DataCollatorWithPadding,
                          Trainer, TrainingArguments)

# SEQ_LEN and tokenizer are the same as in the TensorFlow script above.

def pyTorch():
    def preprocess_function(examples):
        return tokenizer(examples["text"], max_length=SEQ_LEN, truncation=True, padding='max_length',
                         add_special_tokens=True, return_attention_mask=True, return_token_type_ids=False)

    dataset = load_dataset('json', data_files={"train": "full-items.json", "test": "validation-2.json"})
    tokenized = dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    eval_f1 = evaluate.load("f1")
    eval_accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        f1 = eval_f1.compute(predictions=predictions, references=labels)
        accuracy = eval_accuracy.compute(predictions=predictions, references=labels)
        return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

    id2label = {0: "NEGATIVE", 1: "POSITIVE"}
    label2id = {"NEGATIVE": 0, "POSITIVE": 1}
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
    )

    device = torch.device("cuda")
    model.to(device)

    batch_size = 8
    training_args = TrainingArguments(
        num_train_epochs=4,
        output_dir="pytorch",
        learning_rate=4e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        logging_strategy="epoch",
        warmup_steps=0,
    )

    class_weights = sklearn.utils.class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(tokenized["train"]["label"]),
        y=tokenized["train"]["label"])
    weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model("pytorch")
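One visible asymmetry between the two scripts, worth checking before comparing scores (a hedged observation from the code above, not a confirmed diagnosis): the Keras F1 callback is evaluated on tf_train_set, i.e. the training data, while the PyTorch Trainer's compute_metrics runs on the test split. Pointing both at the same held-out split makes the two F1 numbers directly comparable, for example:

# Sketch: evaluate the Keras metric callback on the validation set rather than
# the training set, so the TensorFlow and PyTorch F1 scores are measured on the same data.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics,
                                      eval_dataset=tf_validation_set)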

tfr.keras.losses.ListMLELoss() is always 0 during training, validation, and testing

I have made a ranking model using tensorflow_ranking losses and metrics, but the ListMLELoss() is always 0. The model trains and completes, but I imagine no learning is actually happening since the loss is never really computed. I tried to follow this guide, https://www.tensorflow.org/recommenders/examples/listwise_ranking, as closely as I could, but my use case differs in a few ways. I am not sure why model.fit() runs and I get an NDCG value when the model clearly cannot be learning, since no loss value is being computed.
Here is my ranking model class:
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

class RankingModel(tf.keras.Model):
    def __init__(self, embeddings, vocab_size_dict, dim_dict, loss, activation='sigmoid'):
        super().__init__()
        self.embeddings = embeddings
        self.embedding_layers = {}
        self.vocab_size_dict = vocab_size_dict
        self.dim_dict = dim_dict
        self.activation = activation
        self.loss = loss

        self.embedding_layers['feature_one'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_one'] + 1,
            self.dim_dict['feature_one'],
            name='embedded_feature_one')
        self.embedding_layers['feature_two'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_two'] + 1,
            self.dim_dict['feature_two'],
            name='embedded_feature_two')
        self.embedding_layers['feature_three'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_three'] + 1,
            self.dim_dict['feature_three'],
            name='embedded_feature_three')
        self.embedding_layers['feature_four'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_four'] + 1,
            self.dim_dict['feature_four'],
            name='embedded_feature_four')
        self.embedding_layers['feature_five'] = tf.keras.layers.Embedding(
            self.vocab_size_dict['feature_five'] + 1,
            self.dim_dict['feature_five'],
            name='embedded_feature_five')

        self.flatten = tf.keras.layers.Flatten()
        self.concatenate = tf.keras.layers.Concatenate(axis=1, name='Input_Concatenation')
        self.batchnorm = tf.keras.layers.BatchNormalization(name='batchnorm')
        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation='leaky_relu'),
            tf.keras.layers.Dense(12, activation=activation)
        ])
        self.task = tfrs.tasks.Ranking(
            loss=self.loss,
            metrics=[
                tfr.keras.metrics.NDCGMetric(name="ndcg_metric")
            ]
        )

    def __call__(self, features, training=False):
        feats = []
        for feat, tens in features[0].items():
            if feat in self.embeddings:
                embedding = self.embedding_layers[feat](tens)
                flatten = self.flatten(embedding)
                feats.append(flatten)
            if feat == 'continuous':
                flatten = self.flatten(tens)
                feats.append(flatten)
        deep_concatenated = self.concatenate(feats)
        batchnorm = self.batchnorm(deep_concatenated)
        scores = self.score_model(batchnorm)
        print("scores: ", scores)
        print("mask: ", features[0]['mask'])
        masked_scores = tf.boolean_mask(scores, features[0]['mask'])
        # pred = tf.expand_dims(masked_scores, axis=1)
        # return pred
        return tf.expand_dims(masked_scores, axis=1)

    def compute_loss(self, features, training=False):
        labels = features[1]
        # print("labels: ", labels)
        # print("mask: ", features[0]['mask'])
        masked_labels = tf.boolean_mask(labels, features[0]['mask'])
        # print("masked labels:", masked_labels)
        masked_labels = tf.expand_dims(masked_labels, axis=1)
        print("masked_labels: ", masked_labels)
        scores = self(features)
        print("scores: ", scores)
        print("loss: ", self.task(labels=masked_labels, predictions=scores))
        return self.task(
            labels=masked_labels,
            predictions=scores
        )

    def train_step(self, inputs):
        """Custom train step using the `compute_loss` method."""
        with tf.GradientTape() as tape:
            loss = self.compute_loss(inputs)
            # Handle regularization losses as well.
            regularization_loss = sum(self.losses)
            total_loss = loss + regularization_loss
        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def test_step(self, inputs):
        """Custom test step using the `compute_loss` method."""
        loss = self.compute_loss(inputs)
        # Handle regularization losses as well.
        regularization_loss = sum(self.losses)
        total_loss = loss + regularization_loss
        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics
Can anybody see why I am not getting a loss value? Thanks a lot. Please let me know if you need additional info; maybe I can create some synthetic data so you can run it all yourself. I've been pulling my hair out for a few days trying to get this to work, so any advice is much appreciated.
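One shape-related detail worth checking (a hedged observation based on the tensors above, not a confirmed diagnosis): tf.expand_dims(..., axis=1) after tf.boolean_mask produces tensors of shape [n, 1], so every "list" handed to ListMLELoss contains a single item. With one item per list there is only one possible permutation, so the ListMLE loss is -log(1) = 0 for every example, regardless of the scores. A tiny sketch illustrating the difference:

import tensorflow as tf
import tensorflow_ranking as tfr

loss_fn = tfr.keras.losses.ListMLELoss()

# Shape [batch, 1]: one item per list -> the loss is always 0.
labels_single = tf.constant([[3.0], [1.0], [2.0]])
scores_single = tf.constant([[0.2], [0.9], [0.4]])
print(loss_fn(labels_single, scores_single).numpy())  # 0.0

# Shape [batch, list_size]: several items per list -> a non-trivial loss.
labels_list = tf.constant([[3.0, 1.0, 2.0]])
scores_list = tf.constant([[0.2, 0.9, 0.4]])
print(loss_fn(labels_list, scores_list).numpy())  # > 0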

MT5ForConditionalGeneration with PyTorch Lightning gives AttributeError

I haven't been able to solve this problem for several days.
I'm new to NLP and the solution is probably very simple.
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import MT5ForConditionalGeneration, AdamW

class QAModel(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

model = QAModel()

checkpoint_callback = ModelCheckpoint(
    dirpath='/content/checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

trainer = pl.Trainer(
    checkpoint_callback=checkpoint_callback,
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

trainer.fit(model, data_module)
Running this code gives me
AttributeError: 'QAModel' object has no attribute 'automatic_optimization'
after the fit() call.
Probably the problem is in MT5ForConditionalGeneration, since we get the same error after passing it to the function.
Try inheriting from pl.LightningModule instead of pl.LightningDataModule. It is the right choice for defining a model class.
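A minimal sketch of that fix (only the class declaration changes; the body of QAModel stays as it is):

import pytorch_lightning as pl

# LightningModule is the base class for models; LightningDataModule only
# organises dataloaders, which is why the QAModel above is missing attributes
# the Trainer expects, such as automatic_optimization.
class QAModel(pl.LightningModule):
    ...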

tf.estimator.train_and_evaluate gives wrong evaluation accuracy and loss

I use tf.estimator.train_and_evaluate to train and evaluate my model. This is my code:
import tensorflow as tf
import numpy as np
from tensorflow.contrib.slim.nets import resnet_v2
import tensorflow.contrib.slim as slim

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path='mnist.npz')
x_train = np.expand_dims(x_train, 3).astype(np.float32)[:5000]
y_train = y_train.astype(np.int32)[:5000]
x_test = np.expand_dims(x_test, 3).astype(np.float32)[:1000]
y_test = y_test.astype(np.int32)[:1000]

tf.logging.set_verbosity(tf.logging.INFO)
cls_num = 10

def model_fn(features, labels, mode):
    is_training = False
    if mode == tf.estimator.ModeKeys.TRAIN:
        is_training = True
    with slim.arg_scope(resnet_v2.resnet_arg_scope()):
        logits, endpoints = resnet_v2.resnet_v2_50(features,
                                                   num_classes=cls_num,
                                                   is_training=is_training,
                                                   reuse=None)
    logits = tf.squeeze(logits, [1, 2])
    preds = tf.argmax(logits, 1)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    accuracy = tf.metrics.accuracy(labels=labels, predictions=preds)
    metrics = {'accuracy': accuracy}
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

def process_fn(feature, label):
    feature = tf.expand_dims(feature, 3)
    return feature, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    # dataset.map(process_fn)
    dataset = dataset.repeat(1).batch(8)
    return dataset

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    dataset = dataset.repeat(1).batch(8)
    return dataset

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='logs')
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn)
eval_specs = tf.estimator.EvalSpec(input_fn=eval_input_fn)

for _ in xrange(10):
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs)
The training step is fine and the loss becomes very small (about 0.001), but the evaluation result is wrong (here is the evaluation log):
...
INFO:tensorflow:Saving dict for global step 625: accuracy = 0.5, global_step = 625, loss = 1330830600000.0
...
The task is very simple, just a binary classification. I don't think it is overfitting. Is there something wrong with my evaluation code?
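One thing worth checking (a hedged guess, not a confirmed answer): resnet_v2 under the slim arg_scope uses batch normalization, and in TF 1.x optimizer.minimize() does not run the UPDATE_OPS that maintain the batch-norm moving statistics. If those ops never run, the training loss looks fine (it uses per-batch statistics) while evaluation with is_training=False uses the never-updated moving averages and can produce huge losses. A minimal sketch of the usual pattern inside model_fn:

# Sketch: make the train op depend on the batch-norm update ops so the moving
# mean/variance used at evaluation time actually get updated during training.
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())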

How to feed CSV data into an autoencoder

I am using the code below, which implements an autoencoder. How can I feed it data for training and testing?
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

class Autoencoder(object):
    def __init__(self, n_input, n_hidden, transfer_function=tf.nn.softplus, optimizer=tf.train.AdamOptimizer()):
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.transfer = transfer_function
        network_weights = self._initialize_weights()
        self.weights = network_weights

        # model
        self.x = tf.placeholder(tf.float32, [None, self.n_input])
        self.hidden = self.transfer(tf.add(tf.matmul(self.x, self.weights['w1']), self.weights['b1']))
        self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2'])

        # cost
        self.cost = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.x), 2.0))
        self.optimizer = optimizer.minimize(self.cost)

        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)

    def _initialize_weights(self):
        all_weights = dict()
        all_weights['w1'] = tf.get_variable("w1", shape=[self.n_input, self.n_hidden],
                                            initializer=tf.contrib.layers.xavier_initializer())
        all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
        all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
        all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32))
        return all_weights

    def partial_fit(self, X):
        cost, opt = self.sess.run((self.cost, self.optimizer), feed_dict={self.x: X})
        return cost

    def calc_total_cost(self, X):
        return self.sess.run(self.cost, feed_dict={self.x: X})

    def transform(self, X):
        return self.sess.run(self.hidden, feed_dict={self.x: X})

    def generate(self, hidden=None):
        if hidden is None:
            hidden = self.sess.run(tf.random_normal([1, self.n_hidden]))
        return self.sess.run(self.reconstruction, feed_dict={self.hidden: hidden})

    def reconstruct(self, X):
        return self.sess.run(self.reconstruction, feed_dict={self.x: X})

    def getWeights(self):
        return self.sess.run(self.weights['w1'])

    def getBiases(self):
        return self.sess.run(self.weights['b1'])

# I instantiate the Autoencoder class: 5 is the dimension of a raw input,
# 2 is the dimension of the hidden layer.
autoencoder = Autoencoder(5, 2, transfer_function=tf.nn.softplus, optimizer=tf.train.AdamOptimizer())

# I prepare my data
IRIS_TRAINING = "C:\\Users\\Desktop\\iris_training.csv"

# Feeding data to the Autoencoder ???
# Train and Test ??
How can I train this model with CSV file data? I think I need to run something like _, c = sess.run([optimizer, cost], feed_dict={self.x: batch_of_data}) inside a loop over epochs, but I am struggling with it.
Check out Stanford CS20SI's tutorial.
https://github.com/chiphuyen/tf-stanford-tutorials/blob/master/examples/05_csv_reader.py
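As a minimal sketch of the kind of loop the question describes (assumptions: the CSV at IRIS_TRAINING has a header row, its first 5 columns are the numeric features matching n_input of the Autoencoder defined above, and the batch size and epoch count below are illustrative choices):

import numpy as np

# Load the CSV into a float32 feature matrix.
data = np.genfromtxt(IRIS_TRAINING, delimiter=",", skip_header=1)
X = data[:, :5].astype(np.float32)  # must match the Autoencoder's n_input

n_epochs, batch_size = 50, 16
for epoch in range(n_epochs):
    np.random.shuffle(X)
    for start in range(0, len(X), batch_size):
        batch = X[start:start + batch_size]
        # partial_fit runs one optimizer step and returns the batch cost, i.e.
        # the sess.run((cost, optimizer), feed_dict=...) call the question asks about.
        cost = autoencoder.partial_fit(batch)
    print("epoch", epoch, "training cost", cost)

# A held-out CSV can then be evaluated the same way, e.g.:
# test_cost = autoencoder.calc_total_cost(X_test)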