How to define a ReLU with TensorFlow custom_gradient? - tensorflow

I'm practicing using TensorFlow's custom_gradient decorator and I tried to define a simple ReLU. One would think it would be as simple as defining the gradient to be 1 when x > 0 and 0 otherwise. However, the following code does not yield the same gradients as a ReLU:
@tf.custom_gradient
def relu(x):
    def grad(dy):
        return tf.cond(tf.reshape(x, []) > 0,
                       lambda: tf.cast(tf.reshape(1, dy.shape), tf.float32),
                       lambda: tf.cast(tf.reshape(0, dy.shape), tf.float32))
    return tf.nn.relu(x), grad
Can someone explain to me why this standard definition of ReLU's gradient does not yield the same performance as:
@tf.custom_gradient
def relu(x):
    def grad(dy):
        return dy
    return tf.nn.relu(x), grad
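For reference, here is a minimal element-wise sketch (not from the original post) of a custom-gradient ReLU. The key difference from the snippets above is that grad returns the upstream gradient dy multiplied by the local 0/1 derivative, rather than a constant tensor of ones or zeros:

import tensorflow as tf

@tf.custom_gradient
def relu_sketch(x):
    def grad(dy):
        # Chain rule: upstream gradient times the local derivative,
        # which is 1 where x > 0 and 0 elsewhere, applied element-wise.
        return dy * tf.cast(x > 0, dy.dtype)
    return tf.nn.relu(x), grad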

Related

How To Predict Value Inside Model Loss With Using Current Model Weights?

I want to use a different predicted value, rather than y_true and y_pred, inside a model's loss function. I have tried a few things but they didn't work, so how can I use model.predict inside a loss function in TensorFlow?
import numpy as np
import tensorflow as tf

class CustomLoss(tf.keras.losses.Loss):
    def __init__(self, anchor, positive, model):
        super().__init__()
        self.anchor = anchor
        self.positive = positive
        self.model = model

    def convert(self, image):
        c_im = self.model(image.reshape(1, 160, 160, 3))
        print(c_im)
        c_im = tf.make_ndarray(c_im)
        return c_im / np.linalg.norm(c_im, ord=2)

    def call(self, y_true, y_pred):
        y_pred_c = tf.cast(y_pred, dtype=tf.float32)
        anchor, positive = self.convert(self.anchor), self.convert(self.positive)
        anchor, positive = tf.convert_to_tensor(anchor, dtype=tf.float32), tf.convert_to_tensor(positive, dtype=tf.float32)
        pos_d = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), axis=-1)
        neg_d = tf.reduce_sum(tf.square(tf.subtract(anchor, y_pred_c)), axis=-1)
        basic_l = tf.add(tf.subtract(pos_d, neg_d), 0.2)
        print(basic_l)
        del anchor
        del positive
        return tf.reduce_sum(tf.maximum(basic_l, 0), axis=None)
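One possible direction, sketched below (not from the original post): keep every intermediate value a tensor and call the model directly, instead of going through model.predict or tf.make_ndarray, which do not work on the symbolic tensors seen inside a loss function. The class name CustomTripletLoss, the margin of 0.2, and the input shapes are assumptions carried over from the snippet above:

import tensorflow as tf

class CustomTripletLoss(tf.keras.losses.Loss):
    def __init__(self, anchor, positive, model, margin=0.2):
        super().__init__()
        self.anchor = anchor        # assumed to be a (1, 160, 160, 3) tensor
        self.positive = positive    # assumed to be a (1, 160, 160, 3) tensor
        self.model = model
        self.margin = margin

    def convert(self, image):
        # Call the model on the tensor directly and L2-normalize the embedding,
        # keeping the whole computation inside the graph.
        return tf.math.l2_normalize(self.model(image), axis=-1)

    def call(self, y_true, y_pred):
        y_pred = tf.cast(y_pred, tf.float32)
        anchor = self.convert(self.anchor)
        positive = self.convert(self.positive)
        pos_d = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        neg_d = tf.reduce_sum(tf.square(anchor - y_pred), axis=-1)
        return tf.reduce_sum(tf.maximum(pos_d - neg_d + self.margin, 0.0))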

Model with normalized binary cross entropy loss does not converge

I'm trying to implement normalized binary cross entropy for a classification task following this paper: Normalized Loss Functions for Deep Learning with Noisy Labels.
The math is as follows:
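For a label y and predicted probability p, the per-example normalized binary cross entropy (as computed in the implementation below) is:

\mathrm{NBCE}(y, p) = \frac{-\left[\, y \log p + (1 - y)\log(1 - p) \,\right]}{-\left[\, \log p + \log(1 - p) \,\right]}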
Here is my implementation:
import tensorflow as tf
from keras.utils import losses_utils

class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
    def __init__(
        self,
        from_logits=False,
        label_smoothing=0.0,
        axis=-1,
        reduction=tf.keras.losses.Reduction.NONE,
        name="normalized_binary_crossentropy",
        **kwargs
    ):
        super().__init__(reduction=reduction, name=name)
        self.from_logits = from_logits
        self._epsilon = tf.keras.backend.epsilon()

    def call(self, target, logits):
        if tf.is_tensor(logits) and tf.is_tensor(target):
            logits, target = losses_utils.squeeze_or_expand_dimensions(
                logits, target
            )
        logits = tf.convert_to_tensor(logits)
        target = tf.cast(target, logits.dtype)
        if self.from_logits:
            logits = tf.math.sigmoid(logits)
        logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)
        numer = target * tf.math.log(logits) + (1 - target) * tf.math.log(1 - logits)
        denom = -(tf.math.log(logits) + tf.math.log(1 - logits))
        return -numer / denom

    def get_config(self):
        config = super().get_config()
        config.update({"from_logits": self.from_logits})
        return config
I'm using this loss to train a binary classifier (a CTR predictor), but the model's loss does not decrease and ROC-AUC remains at ~0.49-0.5. To verify the implementation of the numerator, I tried training without the denominator and it works fine.
# Example Usage
import numpy as np

labels = np.array([[0], [1], [0], [0], [0]]).astype(np.int64)
logits = np.array([[-1.024], [2.506], [1.43], [0.004], [-2.0]]).astype(np.float64)

tf_nce = NormalizedBinaryCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE,
    from_logits=True
)
tf_nce(labels, logits)

# <tf.Tensor: shape=(5, 1), dtype=float64, numpy=
# array([[0.18737159],
#        [0.02945536],
#        [0.88459308],
#        [0.50144269],
#        [0.05631594]])>
I checked some extreme values manually and the loss doesn't hit NaNs or 0s.
Can anyone help me debug why the model is not able to converge with this loss? Is there something wrong with my understanding of the loss function or with the implementation?
Edit 1: The model architecture is a Multi-Gate Mixture-of-Experts with 6 tasks. All 6 tasks are binary classification, and the losses from all tasks are added together to get the final loss.
One thing mentioned in the paper, as described above, is that the norm of the loss should lie within [0, 1], but your loss violates this condition of normalized binary cross entropy. The other issue is that you are dividing by the wrong denominator: you have to divide by the cross entropy of your logits, i.e. take the BinaryCrossentropy() of your logits. These may be the reasons your loss is not decreasing. I have made some changes to your code so that it satisfies this norm property:
import tensorflow as tf
from keras.utils import losses_utils

class NormalizedBinaryCrossentropy(tf.keras.losses.Loss):
    def __init__(
        self,
        from_logits=False,
        label_smoothing=0.0,
        axis=-1,
        reduction=tf.keras.losses.Reduction.NONE,
        name="normalized_binary_crossentropy",
        **kwargs
    ):
        super().__init__(reduction=reduction, name=name)
        self.from_logits = from_logits
        self._epsilon = tf.keras.backend.epsilon()

    def call(self, target, logits):
        if tf.is_tensor(logits) and tf.is_tensor(target):
            logits, target = losses_utils.squeeze_or_expand_dimensions(
                logits, target
            )
        logits = tf.convert_to_tensor(logits)
        target = tf.cast(target, logits.dtype)
        logits = tf.clip_by_value(logits, self._epsilon, 1.0 - self._epsilon)
        if self.from_logits:
            numer = tf.keras.losses.binary_crossentropy(target, logits, from_logits=True)[:, tf.newaxis]
            denom = -(tf.math.log(logits) + tf.math.log(1 - logits))
            return numer * denom / tf.reduce_sum(denom)
        else:
            logits = tf.nn.log_softmax(logits)
            num = -tf.math.reduce_sum(tf.multiply(target, logits), axis=1)
            denom = -tf.math.reduce_sum(logits, axis=1)
            return num / denom

    def get_config(self):
        config = super().get_config()
        config.update({"from_logits": self.from_logits})
        return config
I have updated the solution; there are two ways of computing the BCE: if your logits are one-hot, set from_logits=False, otherwise set it to True.
I would try to avoid the log-sigmoid stability issues and instead implement the above model as a two-class problem with softmax binary cross entropy.
The NormalizedCrossEntropy is defined as:
import numpy as np
import tensorflow as tf
from tensorflow import keras

class NormalizedCrossEntropy(keras.layers.Layer):
    def __init__(self, num_classes):
        super(NormalizedCrossEntropy, self).__init__()
        self.num_classes = num_classes

    def call(self, pred, labels):
        pred = tf.nn.log_softmax(pred, axis=1)
        label_one_hot = tf.one_hot(labels, self.num_classes)
        numer = -1 * tf.reduce_sum(label_one_hot * pred, axis=1)
        denom = -1 * tf.reduce_sum(pred, axis=1)
        nce = numer / denom
        return nce
Example usage:
NormalizedCrossEntropy(num_classes=2)(np.array([[-1.024, 0.5], [0.1, 2.506], [1, .0], [0., 1.], [-0.89, -2.0]]), np.array([0, 1, 0, 0, 0]))
# array([0.89725673, 0.03348167, 0.19259584, 0.80740416, 0.16958274])

Tensorflow custom activation function with tf.cond

I'm trying to write a custom activation function using tf.custom_gradient. Specifically, I want to use the Taylor expansion of 1/x for x < 1, and 1/x itself otherwise. Here's my code:
@tf.custom_gradient
def taylor_inverse(x):
    def func(x):
        return tf.cond(x < 1, taylor(x), tf.math.reciprocal(x))
    def grad(upstream):
        return tf.cond(upstream < 1, taylor_grad(upstream), inv_diff(upstream))
    return func(x), grad

@tf.function
def taylor(x):
    return 4 - 6 * x + 4 * x ** 2 - x ** 3

@tf.function
def taylor_grad(x):
    return -3 * x ** 2 + 8 * x - 6

@tf.function
def inv_diff(x):
    return -tf.math.reciprocal(x) ** 2
I get the error message:
TypeError: 'Tensor' object is not callable
The equations are -x^3 + 4x^2 - 6x + 4 and, for the gradient, -3x^2 + 8x - 6. I get the error in this line:
layer_inverse = Lambda(lambda x: taylor_inverse(x),output_shape=(1,))(layer)
Thank you for your help
The second and third arguments of tf.cond should be callables (functions), so use it like this:
@tf.custom_gradient
def taylor_inverse(x):
    def func(x):
        return tf.cond(x < 1, lambda: taylor(x), lambda: tf.math.reciprocal(x))
    def grad(upstream):
        return tf.cond(upstream < 1, lambda: taylor_grad(upstream), lambda: inv_diff(upstream))
    return func(x), grad
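Note also that tf.cond evaluates a single scalar predicate, so the version above only works when x is a scalar. For an activation applied element-wise to an arbitrary tensor, tf.where is the usual alternative. Below is a sketch (not from the original answer) that also chains in the upstream gradient and branches on x rather than on upstream, which is how the chain rule composes:

import tensorflow as tf

@tf.custom_gradient
def taylor_inverse(x):
    # Element-wise select: Taylor expansion where x < 1, 1/x elsewhere.
    y = tf.where(x < 1, 4 - 6 * x + 4 * x ** 2 - x ** 3, tf.math.reciprocal(x))
    def grad(upstream):
        # Chain rule: upstream gradient times the local derivative.
        local = tf.where(x < 1, -3 * x ** 2 + 8 * x - 6, -tf.math.reciprocal(x) ** 2)
        return upstream * local
    return y, grad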

How to use sampled_softmax_loss function in tensorflow's model_with_buckets?

In TensorFlow's model_with_buckets API function, there is a parameter named softmax_loss_function, whose signature is Function(labels, logits) -> loss_batch.
def model_with_buckets(
    encoder_inputs,
    decoder_inputs,
    targets,
    weights,
    buckets,
    seq2seq,
    softmax_loss_function=None,
    per_example_loss=False,
    name=None
)
The traditional way of assigning softmax_loss_function is to use something like softmax_cross_entropy_with_logits:
def softmax_cross_entropy_with_logits(
    _sentinel=None,
    labels=None,
    logits=None,
    dim=-1,
    name=None
)
The sampled_softmax_loss function, on the other hand, has the following definition:
def sampled_softmax_loss(weights,
                         biases,
                         labels,
                         inputs,
                         num_sampled,
                         num_classes,
                         num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         partition_strategy="mod",
                         name="sampled_softmax_loss"):
In an official TensorFlow seq2seq example on translating English to French, sampled_softmax_loss is assigned to the model_with_buckets parameter via a wrapper function sampled_loss, as follows:
w = tf.get_variable("proj_w", [size, self.target_vocab_size], dtype=tf.float32)
w_t = tf.transpose(w)
b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=tf.float32)
output_projection = (w, b)
# maybe need Function (labels, logits)
def sampled_loss(labels, inputs):
labels = tf.reshape(labels, [-1, 1])
return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, self.target_vocab_size)
softmax_loss_function = sampled_loss
This is weird to me, and the code runs with a lot of errors, because inputs is not the same as logits. In my understanding, logits = inputs * weights + biases. So what is the right way to use the sampled_softmax_loss function in TensorFlow's model_with_buckets?
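For what it's worth, here is a sketch of the wrapper written against the signature shown above (an assumption-laden illustration, not the official example): sampled_softmax_loss expects the pre-projection decoder outputs as inputs and builds the (sampled) logits internally from the weights and biases, so passing the arguments by keyword avoids the ordering confusion between labels and inputs:

def sampled_loss(labels, inputs):
    labels = tf.reshape(labels, [-1, 1])
    # `inputs` here are the decoder outputs *before* the output projection;
    # sampled_softmax_loss applies the projection (w_t, b) internally to
    # compute logits only for the sampled classes.
    return tf.nn.sampled_softmax_loss(
        weights=w_t,
        biases=b,
        labels=labels,
        inputs=inputs,
        num_sampled=num_samples,
        num_classes=self.target_vocab_size)

softmax_loss_function = sampled_loss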

parallelising tf.data.Dataset.from_generator

I have a non-trivial input pipeline that from_generator is perfect for...
dataset = tf.data.Dataset.from_generator(complex_img_label_generator,
                                         (tf.int32, tf.string))
dataset = dataset.batch(64)
iter = dataset.make_one_shot_iterator()
imgs, labels = iter.get_next()
Where complex_img_label_generator dynamically generates images and returns a numpy array representing an (H, W, 3) image and a simple string label. The processing is not something I can represent as reading from files plus tf.image operations.
My question is: how do I parallelise the generator? How do I have N of these generators running in their own threads?
One thought was to use dataset.map with num_parallel_calls to handle the threading, but map operates on tensors... Another thought was to create multiple generators, each with its own prefetch, and somehow join them, but I can't see how I'd join N generator streams.
Any canonical examples I could follow?
Turns out I can use Dataset.map if I make the generator super lightweight (only generating metadata) and then move the actual heavy lifting into a stateless function. This way I can parallelise just the heavy-lifting part with .map using a py_func.
Works; but feels a tad clumsy... Would be great to be able to just add num_parallel_calls to from_generator :)
def pure_numpy_and_pil_complex_calculation(metadata, label):
    # some complex pil and numpy work, nothing to do with tf
    ...

dataset = tf.data.Dataset.from_generator(lightweight_generator,
                                         output_types=(tf.string,   # metadata
                                                       tf.string))  # label

def wrapped_complex_calculation(metadata, label):
    return tf.py_func(func=pure_numpy_and_pil_complex_calculation,
                      inp=(metadata, label),
                      Tout=(tf.uint8,    # (H, W, 3) img
                            tf.string))  # label

dataset = dataset.map(wrapped_complex_calculation,
                      num_parallel_calls=8)
dataset = dataset.batch(64)
iter = dataset.make_one_shot_iterator()
imgs, labels = iter.get_next()
I am working on a from_indexable for tf.data.Dataset: https://github.com/tensorflow/tensorflow/issues/14448
The advantage of from_indexable is that it can be parallelized, while a Python generator cannot.
The function from_indexable creates a tf.data.Dataset.range, wraps the indexable in a generalized tf.py_func, and calls map.
For those who want a from_indexable now, here is the library code:
import tensorflow as tf
import numpy as np
from tensorflow.python.framework import tensor_shape
from tensorflow.python.util import nest

def py_func_decorator(output_types=None, output_shapes=None, stateful=True, name=None):
    def decorator(func):
        def call(*args):
            nonlocal output_shapes
            flat_output_types = nest.flatten(output_types)
            flat_values = tf.py_func(
                func,
                inp=args,
                Tout=flat_output_types,
                stateful=stateful, name=name
            )
            if output_shapes is not None:
                # I am not sure if this is necessary
                output_shapes = nest.map_structure_up_to(
                    output_types, tensor_shape.as_shape, output_shapes)
                flattened_shapes = nest.flatten_up_to(output_types, output_shapes)
                for ret_t, shape in zip(flat_values, flattened_shapes):
                    ret_t.set_shape(shape)
            return nest.pack_sequence_as(output_types, flat_values)
        return call
    return decorator

def from_indexable(iterator, output_types, output_shapes=None, num_parallel_calls=None, stateful=True, name=None):
    ds = tf.data.Dataset.range(len(iterator))

    @py_func_decorator(output_types, output_shapes, stateful=stateful, name=name)
    def index_to_entry(index):
        return iterator[index]

    return ds.map(index_to_entry, num_parallel_calls=num_parallel_calls)
and here is an example (note that from_indexable has a num_parallel_calls argument):
class PyDataSet:
    def __len__(self):
        return 20

    def __getitem__(self, item):
        return np.random.normal(size=(item + 1, 10))

ds = from_indexable(PyDataSet(), output_types=tf.float64, output_shapes=[None, 10])
it = ds.make_one_shot_iterator()
entry = it.get_next()
with tf.Session() as sess:
    print(sess.run(entry).shape)
    print(sess.run(entry).shape)
Update June 10, 2018:
Since https://github.com/tensorflow/tensorflow/pull/15121 is merged, the code for from_indexable simplifies to:
import tensorflow as tf

def py_func_decorator(output_types=None, output_shapes=None, stateful=True, name=None):
    def decorator(func):
        def call(*args, **kwargs):
            return tf.contrib.framework.py_func(
                func=func,
                args=args, kwargs=kwargs,
                output_types=output_types, output_shapes=output_shapes,
                stateful=stateful, name=name
            )
        return call
    return decorator

def from_indexable(iterator, output_types, output_shapes=None, num_parallel_calls=None, stateful=True, name=None):
    ds = tf.data.Dataset.range(len(iterator))

    @py_func_decorator(output_types, output_shapes, stateful=stateful, name=name)
    def index_to_entry(index):
        return iterator[index]

    return ds.map(index_to_entry, num_parallel_calls=num_parallel_calls)
Limiting the work done in the generator to a minimum and parallelizing the expensive processing using a map is sensible.
Alternatively, you can "join" multiple generators using parallel_interleave as follows:
def generator(n):
    # returns the n-th generator function
    ...

def dataset(n):
    return tf.data.Dataset.from_generator(generator(n))

ds = tf.data.Dataset.range(N).apply(tf.contrib.data.parallel_interleave(dataset, cycle_length=N))
# where N is the number of generators you use