I am currently struggling to implement a normalized binary cross entropy for semantic segmentation, based on the normalized cross entropy in this publication (relevant pages are 2-3), as a custom loss function for TensorFlow/Keras.
Unfortunately, the PyTorch code given on GitHub (see code below) is not meant for semantic segmentation according to the authors' comments.
@mlconfig.register
class NormalizedCrossEntropy(torch.nn.Module):
    def __init__(self, num_classes, scale=1.0):
        super(NormalizedCrossEntropy, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.scale = scale

    def forward(self, pred, labels):
        pred = F.log_softmax(pred, dim=1)
        label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device)
        nce = -1 * torch.sum(label_one_hot * pred, dim=1) / (- pred.sum(dim=1))
        return self.scale * nce.mean()
A contributor on GitHub tried to implement it for semantic segmentation, also in PyTorch (see the code below or this link), but I don't know whether it is correct.
class NCELoss(torch.nn.Module):
    def __init__(self, ignore_label, class_weight=None):
        super(NCELoss, self).__init__()
        self.ignore_label = ignore_label

    def forward(self, pred, target, sample_weight=None):
        target = target.long()
        b, c, h, w = pred.shape
        logsoftmax = F.log_softmax(pred, dim=1)
        ohot = target
        mask = torch.ones_like(target).float().cuda()
        ohot[torch.where(target == self.ignore_label)] = 0
        mask[torch.where(target == self.ignore_label)] = 0
        ohot = torch.nn.functional.one_hot(ohot, c).reshape(b, c, h, w).float().cuda()  # BCHW
        nce = (-1 * torch.sum(ohot * logsoftmax, dim=1)) / (-1 * logsoftmax.sum(dim=1))  # BHW
        nce = nce * mask  # set weights of ignore label to 0
        nce = torch.sum(nce) / target.ne(self.ignore_label).sum()  # compute mean loss
        return nce
Does anybody have an idea how this loss function could be correctly implemented in TensorFlow for semantic segmentation with a U-Net FCN? Thanks everyone in advance!
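For context, this is the kind of thing I am after: a rough, untested sketch of how the NCELoss above might look in TensorFlow/Keras. It assumes channels-last logits of shape (B, H, W, C), integer label maps of shape (B, H, W), and an ignore_label value; the class name and defaults are my own placeholders.

import tensorflow as tf

class NCELossTF(tf.keras.losses.Loss):
    # Sketch only: normalized cross entropy for segmentation, assuming
    # y_pred are logits with shape (B, H, W, C) and y_true are integer
    # label maps with shape (B, H, W) that may contain `ignore_label`.
    def __init__(self, num_classes, ignore_label=255, name="nce_seg"):
        super().__init__(reduction=tf.keras.losses.Reduction.NONE, name=name)
        self.num_classes = num_classes
        self.ignore_label = ignore_label

    def call(self, y_true, y_pred):
        y_true = tf.cast(y_true, tf.int32)
        mask = tf.cast(tf.not_equal(y_true, self.ignore_label), y_pred.dtype)          # (B, H, W)
        safe_labels = tf.where(tf.equal(y_true, self.ignore_label), tf.zeros_like(y_true), y_true)
        one_hot = tf.one_hot(safe_labels, self.num_classes, dtype=y_pred.dtype)        # (B, H, W, C)
        log_probs = tf.nn.log_softmax(y_pred, axis=-1)                                 # (B, H, W, C)
        numer = -tf.reduce_sum(one_hot * log_probs, axis=-1)                           # (B, H, W)
        denom = -tf.reduce_sum(log_probs, axis=-1)                                     # (B, H, W)
        nce = numer / denom * mask                                                     # zero out ignored pixels
        return tf.reduce_sum(nce) / tf.maximum(tf.reduce_sum(mask), 1.0)               # mean over valid pixels

Whether this is the correct way to handle the ignore label and the per-pixel mean is exactly what I am unsure about.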
Related
This is a similar question to: How to create a keras layer with a custom gradient in TF2.0?
The difference is that I would like to introduce a learnable parameter into the custom layer that I am training.
Here's a toy example of my current approach:
# Method for calculating a custom gradient
@tf.custom_gradient
def scaler(x, s):
    def grad(upstream):
        dy_dx = s
        dy_ds = x
        return dy_dx, dy_ds
    return x * s, grad
# Keras Layer with trainable parameter
class TestLayer(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.scale = self.add_weight("scale",
                                     shape=[1,],
                                     initializer=tf.keras.initializers.Constant(value=2.0),
                                     trainable=True)

    def call(self, inputs):
        return scaler(inputs, self.scale)

# Creates a Keras Model that uses the layer
def Model():
    x_in = tf.keras.layers.Input(shape=(1,))
    x_out = TestLayer()(x_in)
    return tf.keras.Model(inputs=x_in, outputs=x_out, name="fp8_test")
# Create a toy dataset; we want to learn `scale` such that 5 = 2 * scale (i.e., `scale` should learn ~2.5)
def Dataset():
    inps = tf.ones(shape=(10**5,)) * 2      # inputs
    expected = tf.ones(shape=(10**5,)) * 5  # targets
    data_in = tf.data.Dataset.from_tensors(inps)
    data_exp = tf.data.Dataset.from_tensors(expected)
    dataset = tf.data.Dataset.zip((data_in, data_exp))
    return dataset

model = Model()
model.summary()
dataset = Dataset()

# Use `MSE` loss and `SGD` optimizer
model.compile(
    loss=tf.keras.losses.MSE,
    optimizer=tf.keras.optimizers.SGD(),
)
model.fit(dataset, epochs=100)
This is failing with the following shape-related error in the optimizer:
ValueError: Shapes must be equal rank, but are 1 and 2 for '{{node SGD/SGD/update/ResourceApplyGradientDescent}} = ResourceApplyGradientDescent[T=DT_FLOAT, use_locking=true](fp8_test/test_layer_1/ReadVariableOp/resource, SGD/Identity, SGD/IdentityN)' with input shapes: [], [], [100000,1].
I've been staring at the docs for a while and I'm a bit stumped as to why this isn't working. I would really appreciate any input on how to fix this toy example.
Thanks in advance.
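One thing worth noting: the error suggests that the gradient returned for s has the per-example shape [100000, 1] while the weight itself has shape [1]. A hedged, untested sketch of one possible fix is to chain with the upstream gradient and reduce the gradient for s over the batch so its shape matches the weight:

# Sketch: custom gradient that chains with `upstream` and sums the
# per-example gradient for `s` down to the [1]-shaped weight.
@tf.custom_gradient
def scaler(x, s):
    def grad(upstream):
        dy_dx = upstream * s                                  # same shape as x
        dy_ds = tf.reshape(tf.reduce_sum(upstream * x), [1])  # same shape as s
        return dy_dx, dy_ds
    return x * s, grad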
I'm trying to implement normalized binary cross entropy for a classification task following this paper: Normalized Loss Functions for Deep Learning with Noisy Labels.
The math is as follows (normalized binary cross entropy, with p the predicted probability and y the binary target):
NBCE(p, y) = ( y * log(p) + (1 - y) * log(1 - p) ) / ( log(p) + log(1 - p) )
Here is my implementation:
import tensorflow as tf
from keras.utils import losses_utils

class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
    def __init__(
        self,
        from_logits=False,
        label_smoothing=0.0,
        axis=-1,
        reduction=tf.keras.losses.Reduction.NONE,
        name="normalized_binary_crossentropy",
        **kwargs
    ):
        super().__init__(reduction=reduction, name=name)
        self.from_logits = from_logits
        self._epsilon = tf.keras.backend.epsilon()

    def call(self, target, logits):
        if tf.is_tensor(logits) and tf.is_tensor(target):
            logits, target = losses_utils.squeeze_or_expand_dimensions(logits, target)
        logits = tf.convert_to_tensor(logits)
        target = tf.cast(target, logits.dtype)

        if self.from_logits:
            logits = tf.math.sigmoid(logits)

        logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)

        numer = target * tf.math.log(logits) + (1 - target) * tf.math.log(1 - logits)
        denom = - (tf.math.log(logits) + tf.math.log(1 - logits))
        return - numer / denom

    def get_config(self):
        config = super().get_config()
        config.update({"from_logits": self._from_logits})
        return config
I'm using this loss to train a binary classifier (a CTR predictor), but the loss of the model does not decrease and ROC-AUC remains at ~0.49-0.5. To verify the implementation of the numerator, I tried training with the denominator removed, and it works fine.
# Example Usage
import numpy as np

labels = np.array([[0], [1], [0], [0], [0]]).astype(np.int64)
logits = np.array([[-1.024], [2.506], [1.43], [0.004], [-2.0]]).astype(np.float64)

tf_nce = NormalizedBinaryCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE,
    from_logits=True
)
tf_nce(labels, logits)

# <tf.Tensor: shape=(5, 1), dtype=float64, numpy=
# array([[0.18737159],
#        [0.02945536],
#        [0.88459308],
#        [0.50144269],
#        [0.05631594]])>
I checked manually with some extreme values, and the loss doesn't hit NaNs or 0s.
Can anyone help me debug why the model is not able to converge with this loss? Is there something wrong with my understanding of the loss function or its implementation?
Edit 1: The model architecture is a Multi-Gate Mixture-of-Experts with 6 tasks. All 6 tasks are binary classification, and the losses from all tasks are added together to get the final loss.
One thing mentioned in the paper, as described above, is that the norm of the loss should lie inclusively in [0, 1], but your loss violates this condition of normalized binary cross entropy. The other reason is that you are dividing by the wrong denominator: you have to divide by the cross entropy of your logits, i.e. take the BinaryCrossentropy() of your logits. These can be the reasons why your loss function is not decreasing. I have made some changes to your code so that it satisfies this norm property.
import tensorflow as tf
from keras.utils import losses_utils

class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
    def __init__(
        self,
        from_logits=False,
        label_smoothing=0.0,
        axis=-1,
        reduction=tf.keras.losses.Reduction.NONE,
        name="normalized_binary_crossentropy",
        **kwargs
    ):
        super().__init__(reduction=reduction, name=name)
        self.from_logits = from_logits
        self._epsilon = tf.keras.backend.epsilon()

    def call(self, target, logits):
        if tf.is_tensor(logits) and tf.is_tensor(target):
            logits, target = losses_utils.squeeze_or_expand_dimensions(logits, target)
        logits = tf.convert_to_tensor(logits)
        target = tf.cast(target, logits.dtype)
        logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)

        if self.from_logits:
            numer = tf.keras.losses.binary_crossentropy(target, logits, from_logits=True)[:, tf.newaxis]
            denom = -(tf.math.log(logits) + tf.math.log(1 - logits))
            return numer * denom / tf.reduce_sum(denom)
        else:
            logits = tf.nn.log_softmax(logits)
            num = -tf.math.reduce_sum(tf.multiply(target, logits), axis=1)
            denom = -tf.math.reduce_sum(logits, axis=1)
            return num / denom

    def get_config(self):
        config = super().get_config()
        config.update({"from_logits": self._from_logits})
        return config
I have updated the solution; there are two ways of computing the BCE: if your logits are already one-hot probabilities, set from_logits=False, otherwise set it to True.
I would try to avoid log-sigmoid stability issues and implement the above model as a 2-class problem with softmax binary cross entropy.
The NormalizedCrossEntropy is defined as:
import numpy as np
import tensorflow as tf
from tensorflow import keras

class NormalizedCrossEntropy(keras.layers.Layer):
    def __init__(self, num_classes):
        super(NormalizedCrossEntropy, self).__init__()
        self.num_classes = num_classes

    def call(self, pred, labels):
        pred = tf.nn.log_softmax(pred, axis=1)
        label_one_hot = tf.one_hot(labels, self.num_classes)
        numer = -1 * tf.reduce_sum(label_one_hot * pred, axis=1)
        denom = -1 * tf.reduce_sum(pred, axis=1)
        nce = numer / denom
        return nce
Example usage:
NormalizedCrossEntropy(num_classes=2)(np.array([[-1.024, 0.5], [0.1, 2.506], [1, .0], [0., 1.], [-0.89, -2.0]]), np.array([0, 1, 0, 0, 0]))
# array([0.89725673, 0.03348167, 0.19259584, 0.80740416, 0.16958274])
I have a deep SARSA algorithm which works great in PyTorch on LunarLander-v2, and I would like to use it with Keras/TensorFlow. It uses mini-batches of size 64, which are used 128 times for training at each episode.
Here are the results I get. As you can see, it works great with PyTorch but not with Keras/TensorFlow... So I think I am not correctly implementing the training function in Keras/TensorFlow (code is below).
It seems that the loss is oscillating in Keras because epsilon goes too early to a low value, but it works very well in PyTorch...
Do you see something that could explain why it does not work in Keras/TensorFlow? Thanks a lot for your help and any idea that could help me...
Network information:
It uses the Adam optimizer and a network with two layers (256 and 128 units), with ReLU on each:
class Q_Network(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Q_Network, self).__init__()
        self.x_layer = nn.Linear(state_dim, 256)
        self.h_layer = nn.Linear(256, 128)
        self.y_layer = nn.Linear(128, action_dim)
        print(self.x_layer)

    def forward(self, state):
        xh = F.relu(self.x_layer(state))
        hh = F.relu(self.h_layer(xh))
        state_action_values = self.y_layer(hh)
        return state_action_values
For Keras/TensorFlow I use this one:
def CreationModele(dimension):
    entree_etat = keras.layers.Input(shape=(dimension))
    sortie = keras.layers.Dense(units=256, activation='relu')(entree_etat)
    sortie = keras.layers.Dense(units=128, activation='relu')(sortie)
    sortie = keras.layers.Dense(units=4)(sortie)
    modele = keras.Model(inputs=entree_etat, outputs=sortie)
    return modele
Training code
In PyTorch, the training is done by:
def update_Sarsa_Network(self, state, next_state, action, next_action, reward, ends):
    actions_values = torch.gather(self.qnet(state), dim=1, index=action.long())
    next_actions_values = torch.gather(self.qnet(next_state), dim=1, index=next_action.long())
    next_actions_values = reward + (1.0 - ends) * (self.discount_factor * next_actions_values)

    q_network_loss = self.MSELoss_function(actions_values, next_actions_values.detach())
    self.qnet_optim.zero_grad()
    q_network_loss.backward()
    self.qnet_optim.step()
    return q_network_loss
And in Keras/Tensorflow by:
mse = keras.losses.MeanSquaredError(
reduction=keras.losses.Reduction.SUM)
#tf.function
def train(model, batch_next_states_tensor, batch_next_actions_tensor, batch_reward_tensor, batch_end_tensor, batch_states_tensor, batch_actions_tensor, optimizer, gamma):
with tf.GradientTape() as tape:
# EStimation des valeurs des actions courantes
actions_values = model(batch_states_tensor) # (mini_batch_size,4)
actions_values = tf.linalg.diag_part(tf.gather(actions_values,batch_actions_tensor,axis=1)) # (mini_batch_size,)
actions_values = tf.expand_dims(actions_values,-1) # (mini_batch_size,1)
# EStimation des valeurs des actions suivantes
next_actions_values = model(batch_next_states_tensor) # (mini_batch_size,4)
next_actions_values = tf.linalg.diag_part(tf.gather(next_actions_values,batch_next_actions_tensor,axis=1)) # (mini_batch_size,)
cibles = batch_reward_tensor + (1.0 - batch_end_tensor)*gamma*tf.expand_dims(next_actions_values,-1) # (mini_batch_size,1)
error = mse(cibles, actions_values)
grads = tape.gradient(error, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return error
Error function and Optimizer code
The optimizer is Adam in both PyTorch and TensorFlow with lr=0.001. In PyTorch:
def __init__(self, state_dim, action_dim):
    self.qnet = Q_Network(state_dim, action_dim)
    self.qnet_optim = torch.optim.Adam(self.qnet.parameters(), lr=0.001)
    self.discount_factor = 0.99
    self.MSELoss_function = nn.MSELoss(reduction='sum')
    self.replay_buffer = ReplayBuffer()
    pass
In Keras/TensorFlow:
alpha = 1e-3

# Initialize the model
modele_Keras = CreationModele(8)
optimiseur_Keras = keras.optimizers.Adam(learning_rate=alpha)
OK, I finally found a solution by de-correlating the target and action values using two models, one being updated periodically for the target value calculation.
I use one model for estimating the epsilon-greedy actions and computing the Q(s,a) values, and a fixed model (but periodically updated with the weights of the previous model) for calculating the target r + gamma * Q(s',a').
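A minimal sketch of that two-model setup, assuming the CreationModele helper defined above (the copy interval N and episode count are arbitrary placeholders, not tuned values):

# Online model (selects actions and provides Q(s,a)) and target model
# (provides Q(s',a') for the targets), both built with CreationModele.
modele_Keras = CreationModele(8)
modele_cible = CreationModele(8)
modele_cible.set_weights(modele_Keras.get_weights())

N = 10            # assumed copy interval, in episodes
nb_episodes = 600 # example value
for episode in range(nb_episodes):
    # ... collect transitions and call train(...), using modele_cible for
    #     next_actions_values and modele_Keras for actions_values ...
    if episode % N == 0:
        modele_cible.set_weights(modele_Keras.get_weights())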
Here is my result:
I am trying to implement a normalized cross entropy loss as described in this publication.
The math given is (with p(k|x) the softmax probability for class k, q(k|x) the one-hot label distribution, and K classes):
NCE = ( -Σ_k q(k|x) * log p(k|x) ) / ( -Σ_{j=1..K} log p(j|x) )
This paper provided a PyTorch implementation:
@mlconfig.register
class NormalizedCrossEntropy(torch.nn.Module):
    def __init__(self, num_classes, scale=1.0):
        super(NormalizedCrossEntropy, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.scale = scale

    def forward(self, pred, labels):
        pred = F.log_softmax(pred, dim=1)
        label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device)
        nce = -1 * torch.sum(label_one_hot * pred, dim=1) / (- pred.sum(dim=1))
        return self.scale * nce.mean()
But I need this translated to TensorFlow for my ongoing project. Can anyone help me implement this normalized cross entropy loss in TensorFlow?
I think it is just a matter of translating the method names:
import tensorflow as tf

# given y_true as one-hot labels and y_pred as the multiclass probabilities
def NCE(y_true, y_pred):
    num = -tf.math.reduce_sum(tf.multiply(y_true, y_pred), axis=1)
    denom = -tf.math.reduce_sum(y_pred, axis=1)
    return tf.reduce_mean(num / denom)

t = tf.constant([[1, 0, 0], [0, 0, 1]], dtype=tf.float64)
y = tf.constant([[0.3, 0.6, 0.1], [0.1, 0.1, 0.8]], dtype=tf.float64)
NCE(t, y)
# <tf.Tensor: shape=(), dtype=float64, numpy=0.55>
Just check whether the resulting loss is the same, since I have not tested it.
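If you want a closer match to the PyTorch snippet above (raw logits and integer labels rather than probabilities and one-hot targets), a hedged, untested sketch could look like this; the function name and signature are my own:

import tensorflow as tf

# Sketch: literal translation of the PyTorch NormalizedCrossEntropy,
# assuming `pred` are raw logits of shape (N, num_classes) and
# `labels` are integer class indices of shape (N,).
def normalized_cross_entropy(pred, labels, num_classes, scale=1.0):
    log_probs = tf.nn.log_softmax(pred, axis=1)
    label_one_hot = tf.one_hot(labels, num_classes, dtype=log_probs.dtype)
    nce = -tf.reduce_sum(label_one_hot * log_probs, axis=1) / (-tf.reduce_sum(log_probs, axis=1))
    return scale * tf.reduce_mean(nce)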
We're implementing the paper "Variational Autoencoders for Collaborative Filtering" in TF 2.0.
The sample implementation of the above paper in TF 1.0 is given here.
The paper proposes an implementation of a Variational Autoencoder for collaborative filtering. At the output of the encoder, it uses the reparametrization trick to sample the latent vector Z when training the network.
The reparametrization trick samples ϵ ∼ N(0, I_K) and reparametrizes the latent vector Z as:
Z_u = µ_ϕ(x_u) + ϵ ⊙ σ_ϕ(x_u), where µ_ϕ and σ_ϕ are calculated from the output of the encoder.
But at the time of prediction, the paper proposes to use only µ_ϕ for sampling Z.
In our implementation, we used a custom tf.keras.layers.Layer to sample the latent vector Z. The following is the code of the architecture:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

class Reparameterize(tf.keras.layers.Layer):
    """
    Custom layer.
    Reparameterization trick: sample random latent vectors Z from
    the latent Gaussian distribution.
    The sampled vector Z is given by
    sampled_z = mean + std * epsilon
    """
    def call(self, inputs):
        Z_mu, Z_logvar = inputs
        Z_sigma = tf.math.exp(0.5 * Z_logvar)
        epsilon = tf.random.normal(tf.shape(Z_sigma))
        return Z_mu + Z_sigma * epsilon

class VAE:
    def __init__(self, input_dim, latent_dim=200):
        # encoder
        encoder_input = Input(shape=input_dim)
        X = tf.math.l2_normalize(encoder_input, 1)
        X = Dropout(0.5)(X)
        X = Dense(600, activation='tanh')(X)
        Z_mu = Dense(latent_dim)(X)
        Z_logvar = Dense(latent_dim)(X)
        sampled_Z = Reparameterize()([Z_mu, Z_logvar])

        # decoder
        decoder_input = Input(shape=latent_dim)
        X = Dense(600, activation='tanh')(decoder_input)
        logits = Dense(input_dim)(X)

        # define losses
        """
        custom loss function
        def loss(X_true, X_pred)
        """

        # create models
        self.encoder = Model(encoder_input, [Z_logvar, Z_mu, sampled_Z], name='encoder')
        self.decoder = Model(decoder_input, logits, name='decoder')
        self.vae = Model(encoder_input, self.decoder(sampled_Z), name='vae')
        self.vae.add_loss(kl_divergence(Z_logvar, Z_mu))

        # compile the model
        self.vae.compile(optimizer='adam', loss=loss, metrics=[loss])
Now, I am looking for a way to change the implementation of the custom Reparameterize layer at prediction time to use only µ_ϕ (Z_mu) for sampling Z, so as to achieve what is proposed by the paper mentioned above.
Or, if there's another way of doing this in TF 2.0, kindly recommend it.
You could do:
# create your VAE model
my_vae = VAE(input_dim = my_input_dim)
# Train it as you wish
# .....
When training is done, you could use it as follows:
inp = Input(shape = my_input_dim)
_, Z_mu,_ = my_vae.encoder(inp) # my_vae is your trained model, get its outputs
decoder_output = my_vae.decoder(Z_mu) # use the Z_mu as input to decoder
vae_predictor = Model(inp, decoder_output) # create your prediction time model
You can now use the vae_predictor model for predictions.
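Another option (not part of the original answer, just a sketch) is to make the Reparameterize layer respect Keras's training flag, so that fit() samples from the latent distribution while predict() and evaluate() use only the mean:

import tensorflow as tf

class Reparameterize(tf.keras.layers.Layer):
    # Sketch: use Keras's `training` argument so that predict()/evaluate()
    # return Z_mu directly, while fit() uses the sampled latent vector.
    def call(self, inputs, training=None):
        Z_mu, Z_logvar = inputs
        if not training:
            return Z_mu
        Z_sigma = tf.math.exp(0.5 * Z_logvar)
        epsilon = tf.random.normal(tf.shape(Z_sigma))
        return Z_mu + Z_sigma * epsilon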