Model with normalized binary cross entropy loss does not converge - tensorflow

I'm trying to implement normalized binary cross entropy for a classification task following this paper: Normalized Loss Functions for Deep Learning with Noisy Labels.
The math is as follows: the usual binary cross entropy is divided by the sum of the cross entropies over both possible labels,
NBCE(y, p) = -( y·log(p) + (1 − y)·log(1 − p) ) / ( -( log(p) + log(1 − p) ) ), with p = sigmoid(logit).
Here is my implementation:
import tensorflow as tf
from keras.utils import losses_utils


class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
    def __init__(
        self,
        from_logits=False,
        label_smoothing=0.0,
        axis=-1,
        reduction=tf.keras.losses.Reduction.NONE,
        name="normalized_binary_crossentropy",
        **kwargs
    ):
        super().__init__(reduction=reduction, name=name)
        self.from_logits = from_logits
        self._epsilon = tf.keras.backend.epsilon()

    def call(self, target, logits):
        if tf.is_tensor(logits) and tf.is_tensor(target):
            logits, target = losses_utils.squeeze_or_expand_dimensions(
                logits, target
            )
        logits = tf.convert_to_tensor(logits)
        target = tf.cast(target, logits.dtype)

        # Convert logits to probabilities and keep them away from exact 0/1.
        if self.from_logits:
            logits = tf.math.sigmoid(logits)
        logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)

        # BCE divided by the sum of the cross entropies over both labels.
        numer = target * tf.math.log(logits) + (1 - target) * tf.math.log(1 - logits)
        denom = -(tf.math.log(logits) + tf.math.log(1 - logits))
        return -numer / denom

    def get_config(self):
        config = super().get_config()
        config.update({"from_logits": self.from_logits})
        return config
I'm using this loss to train a binary classifier (a CTR predictor), but the model's loss does not decrease and ROC-AUC stays at ~0.49-0.5. To verify the numerator implementation, I tried training with the denominator removed, and that works fine.
# Example Usage
import numpy as np

labels = np.array([[0], [1], [0], [0], [0]]).astype(np.int64)
logits = np.array([[-1.024], [2.506], [1.43], [0.004], [-2.0]]).astype(np.float64)

tf_nce = NormalizedBinaryCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE,
    from_logits=True
)
tf_nce(labels, logits)

# <tf.Tensor: shape=(5, 1), dtype=float64, numpy=
# array([[0.18737159],
#        [0.02945536],
#        [0.88459308],
#        [0.50144269],
#        [0.05631594]])>
I checked a few extreme inputs manually and the loss doesn't hit NaNs or zeros.
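For example, a check along those lines (my own sketch, reusing the tf_nce instance and imports from above; these particular values are not from the original run):

extreme_labels = np.array([[1], [0], [1]]).astype(np.int64)
extreme_logits = np.array([[-20.0], [20.0], [0.0]]).astype(np.float64)
print(tf_nce(extreme_labels, extreme_logits))  # stays finite: no NaNs, no exact zeros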
Can anyone help me debug why the model is not able to converge with this loss? Is there something wrong with my understanding of the loss function or my implementation?
Edit 1: The model architecture is a Multi-Gate Mixture-of-Experts with 6 tasks. All 6 tasks are binary classification, and the losses from all tasks are added together to get the final loss (a minimal sketch of this wiring is below).
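For context, a minimal sketch of how the six task heads and the summed loss could be wired up (the input size, hidden layer, and head names here are illustrative stand-ins, not the actual MMoE model):

import tensorflow as tf

inputs = tf.keras.Input(shape=(128,))
shared = tf.keras.layers.Dense(64, activation="relu")(inputs)  # stand-in for the MMoE block
outputs = [tf.keras.layers.Dense(1, name=f"task_{i}")(shared) for i in range(6)]
model = tf.keras.Model(inputs, outputs)

# One loss per output head; Keras sums the per-task losses into the final training loss.
model.compile(
    optimizer="adam",
    loss=[NormalizedBinaryCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
    )] * 6,
)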

One thing mentioned in the paper, as described above, is that the norm of the loss should lie inclusively in [0, 1], but your loss violates this condition of normalized binary cross entropy. The other reason is that you are dividing by the wrong denominator: you have to divide by the cross entropy of your logits, i.e. take the BinaryCrossentropy() of your logits. These can be the reasons your function is not decreasing. I have made some changes to your code that satisfy this norm property...
import tensorflow as tf
from keras.utils import losses_utils


class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
    def __init__(
        self,
        from_logits=False,
        label_smoothing=0.0,
        axis=-1,
        reduction=tf.keras.losses.Reduction.NONE,
        name="normalized_binary_crossentropy",
        **kwargs
    ):
        super().__init__(reduction=reduction, name=name)
        self.from_logits = from_logits
        self._epsilon = tf.keras.backend.epsilon()

    def call(self, target, logits):
        if tf.is_tensor(logits) and tf.is_tensor(target):
            logits, target = losses_utils.squeeze_or_expand_dimensions(
                logits, target
            )
        logits = tf.convert_to_tensor(logits)
        target = tf.cast(target, logits.dtype)
        logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)

        if self.from_logits:
            # Numerator: standard BCE on the raw logits.
            numer = tf.keras.losses.binary_crossentropy(
                target, logits, from_logits=True
            )[:, tf.newaxis]
            denom = -(tf.math.log(logits) + tf.math.log(1 - logits))
            return numer * denom / tf.reduce_sum(denom)
        else:
            logits = tf.nn.log_softmax(logits)
            num = -tf.math.reduce_sum(tf.multiply(target, logits), axis=1)
            denom = -tf.math.reduce_sum(logits, axis=1)
            return num / denom

    def get_config(self):
        config = super().get_config()
        config.update({"from_logits": self.from_logits})
        return config
I have updated the solution; there are two ways of computing the BCE: if your logits are one-hot, set from_logits=False, otherwise set it to True.
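For instance, a quick usage sketch of the from_logits=False branch (the numbers here are made up; it just shows the expected one-hot / two-column shapes):

import numpy as np
import tensorflow as tf

targets = np.array([[1., 0.], [0., 1.], [1., 0.]])       # one-hot targets, shape (batch, 2)
preds = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])   # two-column probabilities

nbce = NormalizedBinaryCrossentropy(
    from_logits=False,
    reduction=tf.keras.losses.Reduction.NONE,
)
print(nbce(targets, preds))  # per-sample values, each in [0, 1]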

I would try to avoid log-sigmoid stability issues and implement the above model as a 2-class problem with a softmax binary cross entropy.
The NormalizedCrossEntropy is defined as:
import numpy as np
import tensorflow as tf
from tensorflow import keras


class NormalizedCrossEntropy(keras.layers.Layer):
    def __init__(self, num_classes):
        super(NormalizedCrossEntropy, self).__init__()
        self.num_classes = num_classes

    def call(self, pred, labels):
        pred = tf.nn.log_softmax(pred, axis=1)
        label_one_hot = tf.one_hot(labels, self.num_classes)
        numer = -1 * tf.reduce_sum(label_one_hot * pred, axis=1)
        denom = -1 * tf.reduce_sum(pred, axis=1)
        nce = numer / denom
        return nce
Example usage:
NormalizedCrossEntropy(num_classes=2)(np.array([[-1.024, 0.5], [0.1, 2.506], [1, .0], [0., 1.], [-0.89, -2.0]]), np.array([0, 1, 0, 0, 0]))
# array([0.89725673, 0.03348167, 0.19259584, 0.80740416, 0.16958274])

Related

Why can't my chemical VAE learn anything with a toy dataset?

I'm trying to implement a mini version of the chemical VAE described in this paper: 10.1021/acscentsci.7b00572. The model trains successfully and the loss changes; however, the predicted properties of all samples are the same, close to the mean value, and the autoencoder cannot reconstruct the input data. In other words, the model isn't learning anything from training. I have carefully checked my code but failed to find anything wrong. Can anyone help? Thank you.
Here is my code:
import numpy as np
import tensorflow as tf
# example smiles and properties
smiles = ['CCCCO', 'C1CCCCC1', 'C[C@@H](C(=O)O)N', 'C[C@H](C(=O)O)N', 'CC(=O)O'] * 200
y = [1,2,3,4,5] * 200
# smiles to one-hot
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
dicts = set(''.join(smiles))
num_words = len(dicts) + 1
max_lens = 15
tokenizer = Tokenizer(num_words=num_words, char_level=True)
tokenizer.fit_on_texts(smiles)
sequences = tokenizer.texts_to_sequences(smiles)
sequences = pad_sequences(sequences, maxlen = max_lens, padding='post', truncating='post')
x = to_categorical(sequences, num_classes=num_words)
# model
from tensorflow.keras import layers, Model


class VAEWithRegressor(Model):
    """Combines a variational autoencoder with a property regressor."""

    def __init__(self, latent_dim):
        super(VAEWithRegressor, self).__init__()
        # Define the encoder layers
        self.encoder = tf.keras.Sequential(
            [
                layers.InputLayer(input_shape=x[0].shape),
                layers.GRU(units=64, return_sequences=True),
                layers.BatchNormalization(),
                layers.GRU(units=32),
                layers.BatchNormalization(),
                layers.Dense(units=16),
                layers.BatchNormalization(),
                layers.Dense(latent_dim * 2),
            ]
        )
        # Define the decoder layers
        self.decoder = tf.keras.Sequential(
            [
                layers.InputLayer(input_shape=(latent_dim,)),
                layers.Dense(units=16),
                layers.BatchNormalization(),
                layers.Dense(units=32),
                layers.BatchNormalization(),
                layers.RepeatVector(max_lens),
                layers.GRU(units=max_lens, return_sequences=True),
                layers.BatchNormalization(),
                layers.TimeDistributed(layers.Dense(units=num_words)),
                layers.Activation('softmax')
            ]
        )
        # Define the regressor layers
        self.regressor = tf.keras.Sequential(
            [
                layers.InputLayer(input_shape=(latent_dim,)),
                layers.Dense(units=32),
                layers.Dense(units=16),
                layers.Dense(units=1),
            ]
        )

    def encode(self, x):
        # Compute the mean and log variance of the latent variable
        h = self.encoder(x)
        mean, log_var = tf.split(h, num_or_size_splits=2, axis=1)
        return mean, log_var

    def reparameterize(self, mean, log_var):
        # Sample from the latent variable distribution
        eps = tf.random.normal(tf.shape(mean))
        std_dev = tf.exp(0.5 * log_var)
        z = mean + std_dev * eps
        return z

    def decode(self, z):
        # Reconstruct the input from the latent variable
        return self.decoder(z)

    def predict_properties(self, z):
        # Predict the properties of the input
        return self.regressor(z)

    def call(self, x):
        # Define the forward pass of the model
        mean, log_var = self.encode(x)
        z = self.reparameterize(mean, log_var)
        x_pred = self.decode(z)
        properties = self.predict_properties(z)
        return x_pred, mean, log_var, properties

    def vae_loss(self, x, x_pred, mean, log_var):
        recon_loss = tf.reduce_sum(tf.keras.losses.binary_crossentropy(x, x_pred), axis=1)
        kl_loss = -0.5 * tf.reduce_sum(1 + log_var - tf.square(mean) - tf.exp(log_var), axis=1)
        return tf.reduce_mean(recon_loss + kl_loss)

    def property_loss(self, y_true, y_pred):
        # Compute the mean squared error between the true and predicted properties
        return tf.reduce_mean(tf.keras.losses.mean_squared_error(y_true, y_pred))

    def train_step(self, x, y_true):
        with tf.GradientTape() as tape:
            x_pred, mean, log_var, y_pred = self.call(x)
            vae_loss_value = self.vae_loss(x, x_pred, mean, log_var)
            property_loss_value = self.property_loss(y_true, y_pred)
            total_loss = vae_loss_value + property_loss_value
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        gradients = tape.gradient(total_loss, self.trainable_variables)
        optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return vae_loss_value, property_loss_value
latent_dim = 8
num_epochs = 50
batch_size = 256

vae = VAEWithRegressor(latent_dim)
x_train = x
y_train = y

for epoch in range(num_epochs):
    epoch_vae_loss = 0
    epoch_property_loss = 0
    for i in range(0, len(x_train), batch_size):
        x_batch = x_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        vae_loss_value, property_loss_value = vae.train_step(x_batch, y_batch)
        epoch_vae_loss += vae_loss_value
        epoch_property_loss += property_loss_value
    epoch_vae_loss /= (len(x_train) / batch_size)
    epoch_property_loss /= (len(x_train) / batch_size)
    print('Epoch {}, VAE loss: {}, Property loss: {}'.format(epoch+1, epoch_vae_loss, epoch_property_loss))

z_sample = vae.encoder.predict(x)[:, :latent_dim]
x_pred = np.array(vae.decoder.predict(z_sample))
y_pred = np.array(vae.predict_properties(z_sample))

BERT for multi-label text classification always has a similar val accuracy despite fine-tuning

I have a dataset of German news articles that I need to classify for my job. Since it is imbalanced, I am currently focusing on only 12 of the 30 labels. I therefore tried to balance the dataset by oversampling, enhanced with data augmentation. Each sample can belong to multiple categories, so it is a multi-label problem.
The train dataset contains about 127,000 samples.
I am using a German BERT model with TensorFlow, but despite fine-tuning and even adding new layers, my val accuracy is always about 65%; sometimes 67 to 68, but never higher. I wondered if my code is broken or if it is due to the dataset.
Here is what I have right now:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModel

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
transformer_model = TFAutoModel.from_pretrained("dbmdz/bert-base-german-cased", output_hidden_states=False)


def multi_label_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
    """For multi-label classification, one has to define a custom
    accuracy function because neither tf.keras.metrics.Accuracy nor
    tf.keras.metrics.CategoricalAccuracy evaluates the number of
    exact matches.

    :Example:

    >>> from tensorflow.keras import metrics
    >>> y_true = tf.convert_to_tensor([[1., 1.]])
    >>> y_pred = tf.convert_to_tensor([[1., 0.]])
    >>> metrics.Accuracy()(y_true, y_pred).numpy()
    0.5
    >>> metrics.CategoricalAccuracy()(y_true, y_pred).numpy()
    1.0
    >>> multi_label_accuracy(y_true, y_pred).numpy()
    0.0
    """
    y_pred = tf.math.sigmoid(y_pred)
    y_pred = tf.math.round(y_pred)
    exact_matches = tf.math.reduce_all(y_pred == y_true, axis=1)
    exact_matches = tf.cast(exact_matches, tf.float32)
    return tf.math.reduce_mean(exact_matches)


def f1_score(y_true, y_logit):
    '''
    Calculate F1 score
    y_true: true value
    y_logit: predicted value
    '''
    y_logit = tf.math.sigmoid(y_logit)
    true_positives = K.sum(K.round(K.clip(y_true * y_logit, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    predicted_positives = K.sum(K.round(K.clip(y_logit, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return (2 * precision * recall) / (precision + recall + K.epsilon())


for l in transformer_model.layers:
    l.trainable = True

bert = transformer_model.layers[0]

input_ids = tf.keras.layers.Input(shape=(60,), name='input_ids', dtype=np.int32)
attention_masks = tf.keras.layers.Input(shape=(60,), name='attention_masks', dtype=np.int32)

bert_model = bert(input_ids, attention_mask=attention_masks)[0][:, 0, :]
dropout = tf.keras.layers.Dropout(0.2, name="pooled_output")
pooled_output = dropout(bert_model)
dense = tf.keras.layers.Dense(units=256, activation="sigmoid")(pooled_output)
dropout2 = tf.keras.layers.Dropout(0.2)(dense)
dense2 = tf.keras.layers.Dense(units=64, activation="relu")(dropout2)
output = tf.keras.layers.Dense(units=12, name="output")(dense2)

model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)

print("Compile model...", flush=True)
optimizer = Adam(learning_rate=1e-5, decay=1e-6)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=optimizer,
    metrics=[f1_score, multi_label_accuracy],
)
history = model.fit([dataset['train']['bert'], dataset['train']['bert2']], dataset['train']['outputs'], epochs=4, batch_size=64, validation_data=([dataset['val']['bert'], dataset['val']['bert2']], dataset['val']['outputs']))
I would expect the val accuracy to change a lot more by changing the architecture of the model.

Normalized Cross Entropy Loss Implementation Tensorflow/Keras

I am trying to implement a normalized cross entropy loss as described in this publication.
The math given is (as implemented in the reference code below):
NCE(y, p) = ( -Σ_k y_k · log p_k ) / ( -Σ_{j=1}^{K} log p_j ), where p = softmax(pred) and y is the one-hot label.
This paper provided a PyTorch implementation:
#mlconfig.register
class NormalizedCrossEntropy(torch.nn.Module):
    def __init__(self, num_classes, scale=1.0):
        super(NormalizedCrossEntropy, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.scale = scale

    def forward(self, pred, labels):
        pred = F.log_softmax(pred, dim=1)
        label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device)
        nce = -1 * torch.sum(label_one_hot * pred, dim=1) / (- pred.sum(dim=1))
        return self.scale * nce.mean()
But I need this to be translated to tensorflow for my ongoing project. Can anyone help me implement this normalized crossentropy loss in tensorflow?
I think it is just a matter of translating method names:
# given y_true as 1-hot and y_pred as the multiclass probabilities
def NCE(y_true, y_pred):
    num = -tf.math.reduce_sum(tf.multiply(y_true, y_pred), axis=1)
    denom = -tf.math.reduce_sum(y_pred, axis=1)
    return tf.reduce_mean(num / denom)


t = tf.constant([[1, 0, 0], [0, 0, 1]], dtype=tf.float64)
y = tf.constant([[0.3, 0.6, 0.1], [0.1, 0.1, 0.8]], dtype=tf.float64)
NCE(t, y)
# <tf.Tensor: shape=(), dtype=float64, numpy=0.55>
Just check whether the resulting loss is the same, since I haven't tested it.
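One quick way to compare it against the paper's formulation, which applies log_softmax to the raw model outputs before normalizing (my own check, not part of the original answer):

import tensorflow as tf

t = tf.constant([[1, 0, 0], [0, 0, 1]], dtype=tf.float64)
y = tf.constant([[0.3, 0.6, 0.1], [0.1, 0.1, 0.8]], dtype=tf.float64)

log_p = tf.nn.log_softmax(y, axis=1)   # paper-style: log-softmax first
num = -tf.reduce_sum(t * log_p, axis=1)
denom = -tf.reduce_sum(log_p, axis=1)
print(tf.reduce_mean(num / denom))     # compare with the NCE(t, y) value above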

Multi-class weighted loss for semantic image segmentation in keras/tensorflow

Given batched RGB images as input, shape=(batch_size, width, height, 3)
And a multiclass target represented as one-hot, shape=(batch_size, width, height, n_classes)
And a model (Unet, DeepLab) with softmax activation in the last layer.
I'm looking for a weighted categorical cross-entropy loss function in keras/tensorflow.
The class_weight argument in fit_generator doesn't seem to work, and I didn't find the answer here or in https://github.com/keras-team/keras/issues/2115.
def weighted_categorical_crossentropy(weights):
    # weights = [0.9, 0.05, 0.04, 0.01]
    def wcce(y_true, y_pred):
        # y_true, y_pred shape is (batch_size, width, height, n_classes)
        loss = ?...
        return loss
    return wcce
I will answer my question:
from tensorflow.keras import backend as K

def weighted_categorical_crossentropy(weights):
    # weights = [0.9, 0.05, 0.04, 0.01]
    def wcce(y_true, y_pred):
        Kweights = K.constant(weights)
        if not K.is_tensor(y_pred):
            y_pred = K.constant(y_pred)
        y_true = K.cast(y_true, y_pred.dtype)
        return K.categorical_crossentropy(y_true, y_pred) * K.sum(y_true * Kweights, axis=-1)
    return wcce
Usage:
loss = weighted_categorical_crossentropy(weights)
optimizer = keras.optimizers.Adam(lr=0.01)
model.compile(optimizer=optimizer, loss=loss)
I'm using the Generalized Dice Loss. It works better than the weighted categorical crossentropy in my case. My implementation is in PyTorch; however, it should be fairly easy to translate it.
import torch
import torch.nn as nn


class GeneralizedDiceLoss(nn.Module):
    def __init__(self):
        super(GeneralizedDiceLoss, self).__init__()

    def forward(self, inp, targ):
        inp = inp.contiguous().permute(0, 2, 3, 1)
        targ = targ.contiguous().permute(0, 2, 3, 1)
        w = torch.zeros((targ.shape[-1],))
        w = 1. / (torch.sum(targ, (0, 1, 2))**2 + 1e-9)
        numerator = targ * inp
        numerator = w * torch.sum(numerator, (0, 1, 2))
        numerator = torch.sum(numerator)
        denominator = targ + inp
        denominator = w * torch.sum(denominator, (0, 1, 2))
        denominator = torch.sum(denominator)
        dice = 2. * (numerator + 1e-9) / (denominator + 1e-9)
        return 1. - dice
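Since the question asks for keras/tensorflow, a rough translation of the same idea might look like the sketch below (my own untested port; it assumes y_true and y_pred have shape (batch, height, width, n_classes) with softmax already applied):

import tensorflow as tf

def generalized_dice_loss(y_true, y_pred):
    y_true = tf.cast(y_true, y_pred.dtype)
    # Per-class weights: inverse squared class volume over the batch.
    w = 1. / (tf.reduce_sum(y_true, axis=(0, 1, 2)) ** 2 + 1e-9)
    numerator = tf.reduce_sum(w * tf.reduce_sum(y_true * y_pred, axis=(0, 1, 2)))
    denominator = tf.reduce_sum(w * tf.reduce_sum(y_true + y_pred, axis=(0, 1, 2)))
    dice = 2. * (numerator + 1e-9) / (denominator + 1e-9)
    return 1. - dice

It can then be passed to model.compile(loss=generalized_dice_loss) like any other Keras loss.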
This issue might be similar to: Unbalanced data and weighted cross entropy which has an accepted answer.

Keras Loss Function with Additional Dynamic Parameter

I'm working on implementing prioritized experience replay for a deep Q-network, and part of the specification is to multiply gradients by what's known as importance sampling (IS) weights. The gradient modification is discussed in section 3.4 of the following paper: https://arxiv.org/pdf/1511.05952.pdf. I'm struggling with creating a custom loss function that takes in an array of IS weights in addition to y_true and y_pred.
Here's a simplified version of my model:
import numpy as np
import tensorflow as tf
# Input is RAM, each byte in the range of [0, 255].
in_obs = tf.keras.layers.Input(shape=(4,))
# Normalize the observation to the range of [0, 1].
norm = tf.keras.layers.Lambda(lambda x: x / 255.0)(in_obs)
# Hidden layers.
dense1 = tf.keras.layers.Dense(128, activation="relu")(norm)
dense2 = tf.keras.layers.Dense(128, activation="relu")(dense1)
dense3 = tf.keras.layers.Dense(128, activation="relu")(dense2)
dense4 = tf.keras.layers.Dense(128, activation="relu")(dense3)
# Output prediction, which is an action to take.
out_pred = tf.keras.layers.Dense(2, activation="linear")(dense4)
opt = tf.keras.optimizers.Adam(lr=5e-5)
network = tf.keras.models.Model(inputs=in_obs, outputs=out_pred)
network.compile(optimizer=opt, loss=huber_loss_mean_weighted)
Here's my custom loss function, which is just an implementation of Huber Loss multiplied by the IS weights:
'''
' Huber loss: https://en.wikipedia.org/wiki/Huber_loss
'''
def huber_loss(y_true, y_pred):
    error = y_true - y_pred
    cond = tf.keras.backend.abs(error) < 1.0
    squared_loss = 0.5 * tf.keras.backend.square(error)
    linear_loss = tf.keras.backend.abs(error) - 0.5
    return tf.where(cond, squared_loss, linear_loss)

'''
' Importance Sampling weighted huber loss.
'''
def huber_loss_mean_weighted(y_true, y_pred, is_weights):
    error = huber_loss(y_true, y_pred)
    return tf.keras.backend.mean(error * is_weights)
The important bit is that is_weights is dynamic, i.e. it is different each time fit() is called. As such, I cannot simply close over is_weights as described here: Make a custom loss function in keras.
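(For reference, closing over the weights would look like the sketch below, reusing huber_loss from above; fixed_is_weights is a hypothetical constant array, which is exactly why this approach doesn't fit here.)

def make_weighted_huber(fixed_is_weights):
    # The weights are captured once at compile time and never change between fit() calls.
    def loss(y_true, y_pred):
        return tf.keras.backend.mean(huber_loss(y_true, y_pred) * fixed_is_weights)
    return loss

# network.compile(optimizer=opt, loss=make_weighted_huber(fixed_is_weights))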
I found this code online, which appears to use a Lambda layer to compute the loss: https://github.com/keras-team/keras/blob/master/examples/image_ocr.py#L475. It looks promising, but I'm struggling to understand it and adapt it to my particular problem. Any help is appreciated.
OK. Here is an example.
from keras.layers import Input, Dense, Conv2D, MaxPool2D, Flatten
from keras.models import Model
from keras.losses import categorical_crossentropy
def sample_loss(y_true, y_pred, is_weight):
    return is_weight * categorical_crossentropy(y_true, y_pred)
x = Input(shape=(32,32,3), name='image_in')
y_true = Input( shape=(10,), name='y_true' )
is_weight = Input(shape=(1,), name='is_weight')
f = Conv2D(16,(3,3),padding='same')(x)
f = MaxPool2D((2,2),padding='same')(f)
f = Conv2D(32,(3,3),padding='same')(f)
f = MaxPool2D((2,2),padding='same')(f)
f = Conv2D(64,(3,3),padding='same')(f)
f = MaxPool2D((2,2),padding='same')(f)
f = Flatten()(f)
y_pred = Dense(10, activation='softmax', name='y_pred' )(f)
model = Model( inputs=[x, y_true, is_weight], outputs=y_pred, name='train_only' )
model.add_loss( sample_loss( y_true, y_pred, is_weight ) )
model.compile( loss=None, optimizer='sgd' )
print(model.summary())
Note: since you've added the loss through add_loss(), you don't have to do it through compile(loss=xxx).
With regard to training the model, nothing special is needed except that you move y_true to your input end. See below:
import numpy as np
a = np.random.randn(8,32,32,3)
a_true = np.random.randn(8,10)
a_is_weight = np.random.randint(0,2,size=(8,1))
model.fit( [a, a_true, a_is_weight] )
Finally, you can make a testing model (which shares all weights with model) for easier use, i.e.
test_model = Model( inputs=x, outputs=y_pred, name='test_only' )
a_pred = test_model.predict( a )