In 'tensorflow unable to take 'log' - tensorflow

I am working on CapsNet and taking code help from here. Simulation is performed on google colab with tensorflow = 2.4.0. I am getting following error:
AttributeError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:805 train_function *
return step_function(self, iterator)
/content/drive/My Drive/Cervical GAN/Segmentation/Cheng-Lin-Li/SegCaps-master-aashish/utils/custom_losses.py:102 dice_loss *
return 1-dice_soft(y_true, y_pred, from_logits=False)
/content/drive/My Drive/Cervical GAN/Segmentation/Cheng-Lin-Li/SegCaps-master-aashish/utils/custom_losses.py:41 dice_soft *
y_pred = tf.log(y_pred / (1 - y_pred))
AttributeError: module 'tensorflow' has no attribute 'log'
Following is custom_losses.py
'''
Capsules for Object Segmentation (SegCaps)
Original Paper: https://arxiv.org/abs/1804.04241
Code written by: Rodney LaLonde
If you use significant portions of this code or the ideas from our paper, please cite it :)
If you have any questions, please email me at lalonde#knights.ucf.edu.
This file contains the definitions of custom loss functions not present in the default Keras.
=====
This program includes all custom loss functions UNet, tiramisu, Capsule Nets (capsbasic) or SegCaps(segcapsr1 or segcapsr3).
#author: Cheng-Lin Li a.k.a. Clark
#copyright: 2018 Cheng-Lin Li#Insight AI. All rights reserved.
#license: Licensed under the Apache License v2.0. http://www.apache.org/licenses/
#contact: clark.cl.li#gmail.com
Enhancement:
1. Revise default loss_type to jaccard on dice_soft function.
2. add bce_dice_loss for future usage.
'''
import tensorflow as tf
from keras import backend as K
from keras.losses import binary_crossentropy
def dice_soft(y_true, y_pred, loss_type='jaccard', axis=[1,2,3], smooth=1e-5, from_logits=False):
"""Soft dice (Sørensen or Jaccard) coefficient for comparing the similarity
of two batch of data, usually be used for binary image segmentation
i.e. labels are binary. The coefficient between 0 to 1, 1 means totally match.
Parameters
-----------
y_pred : tensor
A distribution with shape: [batch_size, ....], (any dimensions).
y_true : tensor
A distribution with shape: [batch_size, ....], (any dimensions).
loss_type : string
``jaccard`` or ``sorensen``, default is ``jaccard``.
axis : list of integer
All dimensions are reduced, default ``[1,2,3]``.
smooth : float
This small value will be added to the numerator and denominator.
If both y_pred and y_true are empty, it makes sure dice is 1.
If either y_pred or y_true are empty (all pixels are background), dice = ```smooth/(small_value + smooth)``,
then if smooth is very small, dice close to 0 (even the image values lower than the threshold),
so in this case, higher smooth can have a higher dice.
Examples
---------
>>> outputs = tl.act.pixel_wise_softmax(network.outputs)
>>> dice_loss = 1 - tl.cost.dice_coe(outputs, y_)
References
-----------
- `Wiki-Dice <https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient>`_
"""
if not from_logits:
# transform back to logits
_epsilon = tf.convert_to_tensor(1e-7, y_pred.dtype.base_dtype)
y_pred = tf.clip_by_value(y_pred, _epsilon, 1 - _epsilon)
y_pred = tf.log(y_pred / (1 - y_pred))
inse = tf.reduce_sum(y_pred * y_true, axis=axis)
if loss_type == 'jaccard':
l = tf.reduce_sum(y_pred * y_pred, axis=axis)
r = tf.reduce_sum(y_true * y_true, axis=axis)
elif loss_type == 'sorensen':
l = tf.reduce_sum(y_pred, axis=axis)
r = tf.reduce_sum(y_true, axis=axis)
else:
raise Exception("Unknow loss_type")
## old axis=[0,1,2,3]
# dice = 2 * (inse) / (l + r)
# epsilon = 1e-5
# dice = tf.clip_by_value(dice, 0, 1.0-epsilon) # if all empty, dice = 1
## new haodong
dice = (2. * inse + smooth) / (l + r + smooth)
##
dice = tf.reduce_mean(dice)
return dice
def dice_hard(y_true, y_pred, threshold=0.5, axis=[1,2,3], smooth=1e-5):
"""Non-differentiable Sørensen–Dice coefficient for comparing the similarity
of two batch of data, usually be used for binary image segmentation i.e. labels are binary.
The coefficient between 0 to 1, 1 if totally match.
Parameters
-----------
y_pred : tensor
A distribution with shape: [batch_size, ....], (any dimensions).
y_true : tensor
A distribution with shape: [batch_size, ....], (any dimensions).
threshold : float
The threshold value to be true.
axis : list of integer
All dimensions are reduced, default ``[1,2,3]``.
smooth : float
This small value will be added to the numerator and denominator, see ``dice_coe``.
References
-----------
- `Wiki-Dice <https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient>`_
"""
y_pred = tf.cast(y_pred > threshold, dtype=tf.float32)
y_true = tf.cast(y_true > threshold, dtype=tf.float32)
inse = tf.reduce_sum(tf.multiply(y_pred, y_true), axis=axis)
l = tf.reduce_sum(y_pred, axis=axis)
r = tf.reduce_sum(y_true, axis=axis)
## old axis=[0,1,2,3]
# hard_dice = 2 * (inse) / (l + r)
# epsilon = 1e-5
# hard_dice = tf.clip_by_value(hard_dice, 0, 1.0-epsilon)
## new haodong
hard_dice = (2. * inse + smooth) / (l + r + smooth)
##
hard_dice = tf.reduce_mean(hard_dice)
return hard_dice
def dice_loss(y_true, y_pred, from_logits=False):
return 1-dice_soft(y_true, y_pred, from_logits=False)
def bce_dice_loss(y_true, y_pred):
return binary_crossentropy(y_true, y_pred) + dice_loss(y_true, y_pred)
def weighted_binary_crossentropy_loss(pos_weight):
# pos_weight: A coefficient to use on the positive examples.
def weighted_binary_crossentropy(target, output, from_logits=False):
"""Binary crossentropy between an output tensor and a target tensor.
# Arguments
target: A tensor with the same shape as `output`.
output: A tensor.
from_logits: Whether `output` is expected to be a logits tensor.
By default, we consider that `output`
encodes a probability distribution.
# Returns
A tensor.
"""
# Note: tf.nn.sigmoid_cross_entropy_with_logits
# expects logits, Keras expects probabilities.
if not from_logits:
# transform back to logits
_epsilon = tf.convert_to_tensor(1e-7, output.dtype.base_dtype)
output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
output = tf.log(output / (1 - output))
return tf.nn.weighted_cross_entropy_with_logits(targets=target,
logits=output,
pos_weight=pos_weight)
return weighted_binary_crossentropy
def margin_loss(margin=0.4, downweight=0.5, pos_weight=1.0):
'''
Args:
margin: scalar, the margin after subtracting 0.5 from raw_logits.
downweight: scalar, the factor for negative cost.
'''
def _margin_loss(labels, raw_logits):
"""Penalizes deviations from margin for each logit.
Each wrong logit costs its distance to margin. For negative logits margin is
0.1 and for positives it is 0.9. First subtract 0.5 from all logits. Now
margin is 0.4 from each side.
Args:
labels: tensor, one hot encoding of ground truth.
raw_logits: tensor, model predictions in range [0, 1]
Returns:
A tensor with cost for each data point of shape [batch_size].
"""
logits = raw_logits - 0.5
positive_cost = pos_weight * labels * tf.cast(tf.less(logits, margin),
tf.float32) * tf.pow(logits - margin, 2)
negative_cost = (1 - labels) * tf.cast(
tf.greater(logits, -margin), tf.float32) * tf.pow(logits + margin, 2)
return 0.5 * positive_cost + downweight * 0.5 * negative_cost
return _margin_loss
The above comes while using dice loss. When using bce loss there is no error. I have tried tf.math.log instead of tf.log but still getting following error:
TypeError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:805 train_function *
return step_function(self, iterator)
/content/drive/MyDrive/Cervical GAN/Segmentation/Cheng-Lin-Li/SegCaps-master-aashish/utils/custom_losses.py:102 dice_loss *
return 1-dice_soft(y_true, y_pred, from_logits=False)
/content/drive/MyDrive/Cervical GAN/Segmentation/Cheng-Lin-Li/SegCaps-master-aashish/utils/custom_losses.py:43 dice_soft *
inse = tf.reduce_sum(y_pred * y_true, axis=axis)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:1180 binary_op_wrapper
raise e
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:1164 binary_op_wrapper
return func(x, y, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:1496 _mul_dispatch
return multiply(x, y, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper
return target(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:518 multiply
return gen_math_ops.mul(x, y, name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_math_ops.py:6078 mul
"Mul", x=x, y=y, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:558 _apply_op_helper
inferred_from[input_arg.type_attr]))
TypeError: Input 'y' of 'Mul' Op has type uint8 that does not match type float32 of argument 'x'.

The error
TypeError: Input 'y' of 'Mul' Op has type uint8 that does not match type float32 of argument 'x'.
indicates that y does not match the type of x in x * y. This can be fixed by casting to tf.float32.
The problem arises in this line in dice_soft:
inse = tf.reduce_sum(y_pred * y_true, axis=axis)
So one solution is to use tf.cast to cast y_true to the same type as y_pred.

Related

Model with normalized binary cross entropy loss does not converge

I'm trying to implement normalized binary cross entropy for a classification task following this paper: Normalized Loss Functions for Deep Learning with Noisy Labels.
The math is as follows:
Here is my implementation:
import tensorflow as tf
from keras.utils import losses_utils
class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
def __init__(
self,
from_logits=False,
label_smoothing=0.0,
axis=-1,
reduction=tf.keras.losses.Reduction.NONE,
name="normalized_binary_crossentropy",
**kwargs
):
super().__init__(
reduction=reduction, name=name
)
self.from_logits = from_logits
self._epsilon = tf.keras.backend.epsilon()
def call(self, target, logits):
if tf.is_tensor(logits) and tf.is_tensor(target):
logits, target = losses_utils.squeeze_or_expand_dimensions(
logits, target
)
logits = tf.convert_to_tensor(logits)
target = tf.cast(target, logits.dtype)
if self.from_logits:
logits = tf.math.sigmoid(logits)
logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)
numer = target * tf.math.log(logits) + (1 - target) * tf.math.log(1 - logits)
denom = - (tf.math.log(logits) + tf.math.log(1 - logits))
return - numer / denom
def get_config(self):
config = super().get_config()
config.update({"from_logits": self._from_logits})
return config
I'm using this loss to train a binary classifier (CTR predictor), but loss of the model does not decrease and ROC-AUC remains at ~0.49-0.5. To verify the implementation of numerator, I tried training by removing the denominator and it's working fine.
# Example Usage
labels = np.array([[0], [1], [0], [0], [0]]).astype(np.int64)
logits = np.array([[-1.024], [2.506], [1.43], [0.004], [-2.0]]).astype(np.float64)
tf_nce = NormalizedBinaryCrossentropy(
reduction=tf.keras.losses.Reduction.NONE,
from_logits=True
)
tf_nce(labels, logits)
#<tf.Tensor: shape=(5, 1), dtype=float64, numpy=
# array([[0.18737159],
# [0.02945536],
# [0.88459308],
# [0.50144269],
# [0.05631594]])>
I checked manually with some extremes and that loss doesn't hit nans or 0s.
Can anyone help me in debugging why the model is not able to converge on this loss? Is there something wrong with my understanding of the loss function or implementation?
Edit 1: Model architecture is a Multi-Gate Mixture-of-Experts with 6 tasks. All 6 tasks are binary classification and losses from all tasks are added together to get final loss.
One thing which is mentioned in the paper as described above is that the Norm of the loss should be inclusively in between [0 ~ 1] but as your loss is violating this condition of Normalized Binary Cross Entropy and the other reason is you are dividing by the wrong denominator, you have to divide it by the Cross-Entropy of your logits for this take the BinaryCrossEntropy() of your logits. so, these can be the reasons that your function is not decreasing... I have made some changes to your code that satisfy this Norm Property...
import tensorflow as tf
from keras.utils import losses_utils
class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
def __init__(
self,
from_logits=False,
label_smoothing=0.0,
axis=-1,
reduction=tf.keras.losses.Reduction.NONE,
name="normalized_binary_crossentropy",
**kwargs
):
super().__init__(
reduction=reduction, name=name
)
self.from_logits = from_logits
self._epsilon = tf.keras.backend.epsilon()
def call(self, target, logits):
if tf.is_tensor(logits) and tf.is_tensor(target):
logits, target = losses_utils.squeeze_or_expand_dimensions(
logits, target
)
logits = tf.convert_to_tensor(logits)
target = tf.cast(target, logits.dtype)
logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)
if self.from_logits:
numer = tf.keras.losses.binary_crossentropy(target, logits,from_logits=True)[:,tf.newaxis]
denom = -( tf.math.log(logits) + tf.math.log(1 - logits))
return numer * denom / tf.reduce_sum(denom)
else:
logits = tf.nn.log_softmax(logits)
num = - tf.math.reduce_sum(tf.multiply(target, logits), axis=1)
denom = -tf.math.reduce_sum(logits, axis=1)
return num / denom
def get_config(self):
config = super().get_config()
config.update({"from_logits": self._from_logits})
return config
I have updated the solution, there are two ways for computing the BCE if your logits are one-hot then set from_logit=False else set it True.
I would try to avoid log-Sigmoid stability issues and try to implement the above model as a 2 class problem with Softmax Binary cross entropy..
The NormalizedCrossEntropy is defined as:
class NormalizedCrossEntropy(keras.layers.Layer):
def __init__(self, num_classes):
super(NormalizedCrossEntropy, self).__init__()
self.num_classes = num_classes
def call(self, pred, labels):
pred = tf.nn.log_softmax(pred, axis=1,)
label_one_hot = tf.one_hot(labels, self.num_classes)
numer = -1 * tf.reduce_sum(label_one_hot * pred, axis=1)
denom = -1* tf.reduce_sum(pred, axis=1)
nce = numer/ denom
return nce
Example usage:
NormalizedCrossEntropy(num_classes=2)(np.array([[-1.024, 0.5], [0.1, 2.506], [1, .0], [0., 1.], [-0.89, -2.0]]), np.array([0, 1, 0, 0, 0]) )
#array([0.89725673, 0.03348167, 0.19259584, 0.80740416, 0.16958274]

Correct Implementation of Dice Loss in Tensorflow / Keras

I've been trying to experiment with Region Based: Dice Loss but there have been a lot of variations on the internet to a varying degree that I could not find two identical implementations. The problem is that all of these produce varying results. Below are the implementations that I found. Some uses smoothing factor which the authors in this paper have called epsilon, some use it in both numerator and denominator, one implementation used Gamma etc etc.
Could someone please help me with the correct implementation.
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
def dice_loss1(y_true, y_pred, smooth=1e-6):
'''
https://www.kaggle.com/code/bigironsphere/loss-function-library-keras-pytorch/notebook
'''
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
smooth = tf.cast(smooth, y_pred.dtype)
y_pred = K.flatten(y_pred)
y_true = K.flatten(y_true)
intersection = K.sum(K.dot(y_true, y_pred))
dice_coef = (2*intersection + smooth) / (K.sum(y_true) + K.sum(y_pred) + smooth)
dice_loss = 1-dice_coef
return dice_loss
def dice_loss2(y_true, y_pred, smooth=1e-6): # Only Smooth
"""
https://gist.github.com/wassname/7793e2058c5c9dacb5212c0ac0b18a8a
"""
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
smooth = tf.cast(smooth, y_pred.dtype)
intersection = K.sum(K.abs(y_true * y_pred), axis=-1)
dice_coef = (2. * intersection + smooth) / (K.sum(K.square(y_true),-1) + K.sum(K.square(y_pred),-1) + smooth)
return 1- dice_coef
def dice_loss3(y_true, y_pred): # No gamma, no smooth
'''
https://lars76.github.io/2018/09/27/loss-functions-for-segmentation.html
'''
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
y_pred = tf.math.sigmoid(y_pred)
numerator = 2 * tf.reduce_sum(y_true * y_pred)
denominator = tf.reduce_sum(y_true + y_pred)
return 1 - numerator / denominator
def dice_loss4(y_true, y_pred, smooth=1e-6, gama=1): # Gama + Smooth is used
'''
https://dev.to/_aadidev/3-common-loss-functions-for-image-segmentation-545o
'''
y_pred = tf.convert_to_tensor(y_pred)
y_true = tf.cast(y_true, y_pred.dtype)
smooth = tf.cast(smooth, y_pred.dtype)
gama = tf.cast(gama, y_pred.dtype)
nominator = 2 * tf.reduce_sum(tf.multiply(y_pred, y_true)) + smooth
denominator = tf.reduce_sum(y_pred ** gama) + tf.reduce_sum(y_true ** gama) + smooth
result = 1 - tf.divide(nominator, denominator)
return result
y_true = np.array([[0,0,1,0],
[0,0,1,0],
[0,0,1.,0.]])
y_pred = np.array([[0,0,0.9,0],
[0,0,0.1,0],
[1,1,0.1,1.]])
# print(dice_loss1(y_true, y_pred)) # Gives you error in K.dot()
print(dice_loss2(y_true, y_pred))
print(dice_loss3(y_true, y_pred)) # provides array of values
print(dice_loss4(y_true, y_pred))
I utilized a variation of the dice loss for brain tumor segmentation. The implementation for the dice coefficient which I used for such results was:
def dice_coef(y_true, y_pred, smooth=100):
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
dice = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
return dice
In order to make it a loss, it needs to be made into a function we want to minimize. This can be accomplished by making it negative:
def dice_coef_loss(y_true, y_pred):
return -dice_coef(y_true, y_pred)
or subtracting it from 1:
def dice_coef_loss(y_true, y_pred):
return 1 - dice_coef(y_true, y_pred)
or applying some other function then negating - for example, taking the negative logarithm (which could smooth the gradients):
def dice_coef_loss(y_true, y_pred):
return -K.log(dice_coef(y_true, y_pred))
The variable smooth represents your observation in other implementations with various names (smoothing, epsilon, etc.). Just for clarity, this smoothing variable exists to handle the case where the ground truth has very few white (or no) white pixels (assuming white pixels belonging to a class or boundary of an object, depending on your implementation).
If smooth is set too low, when the ground truth has few to 0 white pixels and the predicted image has some non-zero number of white pixels, the model will be penalized more heavily. Setting smooth higher means if the predicted image has some low amount of white pixels when the ground truth has none, the loss value will be lower. Depending on how aggressive the model needs to be, though, maybe a lower value is good.
Here's an illustrative example:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
def dice_coef(y_true, y_pred, smooth):
y_true_f = K.flatten(y_true)
y_pred_f = K.flatten(y_pred)
intersection = K.sum(y_true_f * y_pred_f)
dice = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
return dice
def dice_coef_loss(y_true, y_pred, smooth):
return 1 - dice_coef(y_true, y_pred, smooth)
if __name__ == '__main__':
smooth = 10e-6
y_pred = np.zeros((1, 128, 128))
# one pixel is set to 1
y_pred[0, 0, 0] = 1
y_pred = tf.convert_to_tensor(y_pred, dtype=tf.float32)
y_true = tf.zeros((1, 128, 128), dtype=tf.float32)
print(dice_coef(y_true, y_pred, smooth=smooth))
print(dice_coef_loss(y_true, y_pred, smooth=smooth))
will print out:
tf.Tensor(9.9999e-06, shape=(), dtype=float32)
tf.Tensor(0.99999, shape=(), dtype=float32)
But if smooth is set to 100:
tf.Tensor(0.990099, shape=(), dtype=float32)
tf.Tensor(0.009900987, shape=(), dtype=float32)
Showing the loss reduces to 0.009 instead of 0.99.
For completeness, if you have multiple segmentation channels (B X W X H X K, where B is the batch size, W and H are the dimensions of your image, and K are the different segmentations channels), the same concepts apply, but it can be implemented as follows:
def dice_coef_multilabel(y_true, y_pred, M, smooth):
dice = 0
for index in range(M):
dice += dice_coef(y_true[:,:,:,index], y_pred[:,:,:,index], smooth)
return dice
And it can be converted to a loss function through negation or subtraction, in the same way as dice_coef is. smooth could also be tuned per channel, if you supply a list or some other sequence (e.g; smooth_list):
def dice_coef_multilabel(y_true, y_pred, M, smooth_list):
dice = 0
for index in range(M):
dice += dice_coef(y_true[:,:,:,index], y_pred[:,:,:,index], smooth_list[index])
return dice

Multi-class weighted loss for semantic image segmentation in keras/tensorflow

Given batched RGB images as input, shape=(batch_size, width, height, 3)
And a multiclass target represented as one-hot, shape=(batch_size, width, height, n_classes)
And a model (Unet, DeepLab) with softmax activation in last layer.
I'm looking for weighted categorical-cross-entropy loss funciton in kera/tensorflow.
The class_weight argument in fit_generator doesn't seems to work, and I didn't find the answer here or in https://github.com/keras-team/keras/issues/2115.
def weighted_categorical_crossentropy(weights):
# weights = [0.9,0.05,0.04,0.01]
def wcce(y_true, y_pred):
# y_true, y_pred shape is (batch_size, width, height, n_classes)
loos = ?...
return loss
return wcce
I will answer my question:
def weighted_categorical_crossentropy(weights):
# weights = [0.9,0.05,0.04,0.01]
def wcce(y_true, y_pred):
Kweights = K.constant(weights)
if not K.is_tensor(y_pred): y_pred = K.constant(y_pred)
y_true = K.cast(y_true, y_pred.dtype)
return K.categorical_crossentropy(y_true, y_pred) * K.sum(y_true * Kweights, axis=-1)
return wcce
Usage:
loss = weighted_categorical_crossentropy(weights)
optimizer = keras.optimizers.Adam(lr=0.01)
model.compile(optimizer=optimizer, loss=loss)
I'm using the Generalized Dice Loss. It works better than the Weighted Categorical Crossentropy in my case. My implementation is in PyTorch, however, it should be fairly easy to translate it.
class GeneralizedDiceLoss(nn.Module):
def __init__(self):
super(GeneralizedDiceLoss, self).__init__()
def forward(self, inp, targ):
inp = inp.contiguous().permute(0, 2, 3, 1)
targ = targ.contiguous().permute(0, 2, 3, 1)
w = torch.zeros((targ.shape[-1],))
w = 1. / (torch.sum(targ, (0, 1, 2))**2 + 1e-9)
numerator = targ * inp
numerator = w * torch.sum(numerator, (0, 1, 2))
numerator = torch.sum(numerator)
denominator = targ + inp
denominator = w * torch.sum(denominator, (0, 1, 2))
denominator = torch.sum(denominator)
dice = 2. * (numerator + 1e-9) / (denominator + 1e-9)
return 1. - dice
This issue might be similar to: Unbalanced data and weighted cross entropy which has an accepted answer.

Weighted Pixel Wise Categorical Cross Entropy for Semantic Segmentation

I have recently started learning about Semantic Segmentation. I am trying to train a UNet for the same. My input is RGB 128x128x3 images. My masks are made up of 4 classes 0, 1, 2, 3 and are One-Hot Encoded with dimension 128x128x4.
def weighted_cce(y_true, y_pred):
weights = []
t_inf = tf.convert_to_tensor(1e9, dtype = 'float32')
t_zero = tf.convert_to_tensor(0, dtype = 'int64')
for i in range(0, 4):
l = tf.argmax(y_true, axis = -1) == i
n = tf.cast(tf.math.count_nonzero(l), 'float32') + K.epsilon()
weights.append(n)
weights = [batch_size/j for j in weights]
y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
# clip to prevent NaN's and Inf's
y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
# calc
loss = y_true * K.log(y_pred) * weights
loss = -K.sum(loss, -1)
return loss
This is the loss function that I am using but it classifies every pixel as 2. What am I doing wrong?
You should have weights based on you entire data (unless your batch size is reasonably big so you have sort of stable weights).
If some class is underrepresented, with a small batch size, it will have near infinity weights.
If your target data is numpy array:
shp = y_train.shape
totalPixels = shp[0] * shp[1] * shp[2]
weights = np.sum(y_train, axis=(0, 1, 2)) #final shape (4,)
weights = totalPixels/weights
If your data is in a Sequence generator:
totalPixels = 0
counts = np.zeros((4,))
for i in range(len(generator)):
x, y = generator[i]
shp = y.shape
totalPixels += shp[0] * shp[1] * shp[2]
counts = counts + np.sum(y, axis=(0,1,2))
weights = totalPixels / counts
If your data is in a yield generator (you must know how many batches you have in an epoch):
for i in range(batches_per_epoch):
x, y = next(generator)
#the rest is equal to the Sequence example above
Attempt 1
I don't know if newer versions of Keras are able to handle this, but you can try the simplest approach first: simply call fit or fit_generator with the class_weight argument:
model.fit(...., class_weight = {0: weights[0], 1: weights[1], 2: weights[2], 3: weights[3]})
Attempt 2
Make a healthier loss function:
weights = weights.reshape((1,1,1,4))
kWeights = K.constant(weights)
def weighted_cce(y_true, y_pred):
yWeights = kWeights * y_pred #shape (batch, 128, 128, 4)
yWeights = K.sum(yWeights, axis=-1) #shape (batch, 128, 128)
loss = K.categorical_crossentropy(y_true, y_pred) #shape (batch, 128, 128)
wLoss = yWeights * loss
return K.sum(wLoss, axis=(1,2))

Correlation-based loss function for sequence labelling in Keras

I have a question concerning the implementation of a correlation-based loss function for a sequence labelling task in Keras (Tensorflow backend).
Consider we have a sequence labelling problem, e.g., the input is a tensor of shape (20,100,5), the output is a tensor of shape (20,100,1).
In the documentation it is written that, the loss function needs to return a "scalar for each data point". What the default MSE loss does for the loss between tensors of shape (20,100,1) is to return a loss tensor of shape (20,100).
Now, if we use a loss function based on the correlation coefficient for each sequence, in theory, we will get only a single value for each sequence, i.e., a tensor of shape (20,).
However, using this in Keras as a loss function, fit() returns an error as a tensor of shape (20,100) is expected.
On the other side, there is no error when I either
Return just the mean value of the tensor (a single scalar for the whole data), or
Repeat the tensor (using K.repeat_elements) ending up in a tensor of shape (20,100).
The framework does not return an error (Tensorflow backend) and the loss is reduced over epochs, also on independent test data, the performance is good.
My questions are:
Which dimensionality of the targets/losses does the "fit" function usually assume in case of sequences?
Is the Tensorflow backend able to derive the gradients properly also with only the mean value returned?
Please find below an executable example with my implementations of correlation-based loss functions.
my_loss_1 returns only the mean value of the correlation coefficients of all (20) sequences.
my_loss_2 returns only one loss for each sequence (does not work in a real training).
my_loss_3 repeats the loss for each sample within each sequence.
Many thanks and best wishes
from keras import backend as K
from keras.losses import mean_squared_error
import numpy as np
import tensorflow as tf
def my_loss_1(seq1, seq2): # Correlation-based loss function - version 1 - return scalar
seq1 = K.squeeze(seq1, axis=-1)
seq2 = K.squeeze(seq2, axis=-1)
seq1_mean = K.mean(seq1, axis=-1, keepdims=True)
seq2_mean = K.mean(seq2, axis=-1, keepdims=True)
nominator = K.sum((seq1-seq1_mean) * (seq2-seq2_mean), axis=-1)
denominator = K.sqrt( K.sum(K.square(seq1-seq1_mean), axis=-1) * K.sum(K.square(seq2-seq2_mean), axis=-1) )
corr = nominator / (denominator + K.common.epsilon())
corr_loss = K.constant(1.) - corr
corr_loss = K.mean(corr_loss)
return corr_loss
def my_loss_2(seq1, seq2): # Correlation-based loss function - version 2 - return 1D array
seq1 = K.squeeze(seq1, axis=-1)
seq2 = K.squeeze(seq2, axis=-1)
seq1_mean = K.mean(seq1, axis=-1, keepdims=True)
seq2_mean = K.mean(seq2, axis=-1, keepdims=True)
nominator = K.sum((seq1-seq1_mean) * (seq2-seq2_mean), axis=-1)
denominator = K.sqrt( K.sum(K.square(seq1-seq1_mean), axis=-1) * K.sum(K.square(seq2-seq2_mean), axis=-1) )
corr = nominator / (denominator + K.common.epsilon())
corr_loss = K.constant(1.) - corr
return corr_loss
def my_loss_3(seq1, seq2): # Correlation-based loss function - version 3 - return 2D array
seq1 = K.squeeze(seq1, axis=-1)
seq2 = K.squeeze(seq2, axis=-1)
seq1_mean = K.mean(seq1, axis=-1, keepdims=True)
seq2_mean = K.mean(seq2, axis=-1, keepdims=True)
nominator = K.sum((seq1-seq1_mean) * (seq2-seq2_mean), axis=-1)
denominator = K.sqrt( K.sum(K.square(seq1-seq1_mean), axis=-1) * K.sum(K.square(seq2-seq2_mean), axis=-1) )
corr = nominator / (denominator + K.common.epsilon())
corr_loss = K.constant(1.) - corr
corr_loss = K.reshape(corr_loss, (-1,1))
corr_loss = K.repeat_elements(corr_loss, K.int_shape(seq1)[1], 1) # Does not work for fit(). It seems that NO dimension may be None in order to get a value!=None from int_shape().
return corr_loss
# Test
sess = tf.Session()
# input (20,100,1)
a1 = np.random.rand(20,100,1)
a2 = np.random.rand(20,100,1)
print('\nInput: ' + str(a1.shape))
p1 = K.placeholder(shape=a1.shape, dtype=tf.float32)
p2 = K.placeholder(shape=a1.shape, dtype=tf.float32)
loss0 = mean_squared_error(p1,p2)
print('\nMSE:') # output: (20,100)
print(sess.run(loss0, feed_dict={p1: a1, p2: a2}))
loss1 = my_loss_1(p1,p2)
print('\nCorrelation coefficient:') # output: ()
print(sess.run(loss1, feed_dict={p1: a1, p2: a2}))
loss2 = my_loss_2(p1,p2)
print('\nCorrelation coefficient:') # output: (20,)
print(sess.run(loss2, feed_dict={p1: a1, p2: a2}))
loss3 = my_loss_3(p1,p2)
print('\nCorrelation coefficient:') # output: (20,100)
print(sess.run(loss3, feed_dict={p1: a1, p2: a2}))
Now, if we use a loss function based on the correlation coefficient
for each sequence, in theory, we will get only a single value for each
sequence, i.e., a tensor of shape (20,).
That's not true. the coefficient is something like
average((avg_label - label_value)(average_prediction - prediction_value)) /
(var(label_value)*var(prediction_value))
Remove the overall average and you are left the componenets of the correlation coefficient, per element of the sequence, which is the right shape.
You can plug in other correlation formulas as well, just stop before computing the single value.
Thanks a lot!
Well, I thought the coefficient is already the overall (averaged) metric over a sample sequence, but your solution makes sense, indeed.
Below, there is my running code (the summation in the denominator has also been changed to averaging now, otherwise the result would get smaller the longer the sequence is and this may not be as the overall loss is the mean over all losses). It works well when applied to real tasks (not shown here).
The only problem I still have is that the squeezing step at the beginning of the loss function is not so nice, but I was not able to find a nicer solution.
from keras import backend as K
from keras.losses import mean_squared_error
import numpy as np
import tensorflow as tf
def my_loss(seq1, seq2): # Correlation-based loss function
seq1 = K.squeeze(seq1, axis=-1) # To remove the last dimension
seq2 = K.squeeze(seq2, axis=-1) # To remove the last dimension
seq1_mean = K.mean(seq1, axis=-1, keepdims=True)
seq2_mean = K.mean(seq2, axis=-1, keepdims=True)
nominator = (seq1-seq1_mean) * (seq2-seq2_mean)
denominator = K.sqrt( K.mean(K.square(seq1-seq1_mean), axis=-1, keepdims=True) * K.mean(K.square(seq2-seq2_mean), axis=-1, keepdims=True) )
corr = nominator / (denominator + K.common.epsilon())
corr_loss = K.constant(1.) - corr
return corr_loss
# Test
sess = tf.Session()
# Input (20,100,1)
a1 = np.random.rand(20,100,1)
a2 = np.random.rand(20,100,1)
print('\nInput: ' + str(a1.shape))
p1 = K.placeholder(shape=a1.shape, dtype=tf.float32)
p2 = K.placeholder(shape=a1.shape, dtype=tf.float32)
loss0 = mean_squared_error(p1,p2)
print('\nMSE:') # output: (20,100)
print(sess.run(loss0, feed_dict={p1: a1, p2: a2}))
loss1 = my_loss(p1,p2)
print('\nCorrelation coefficient-based loss:') # output: (20,100)
print(sess.run(loss1, feed_dict={p1: a1, p2: a2}))