Can't apply gradients on tf.Variable - tensorflow

I am trying to learn a similarity matrix (M) between two image embeddings. A single instance of training is a pair of images (anchor, positive), so ideally the model will return a distance of 0 for embeddings of similar images.
The problem is that when I declare the distance matrix (M) as a tf.Variable, the following line returns an error:
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
TypeError: 'Variable' object is not iterable.
I think I should use a tensorflow datatype for M that is iterable.
Please tell me how I can fix this issue.
import tensorflow as tf
from tensorflow import keras

# metric learning model
class MetricLearningModel:

    def __init__(self, lr):
        self.optimizer = keras.optimizers.Adam(lr=lr)
        self.lr = lr
        self.loss_object = keras.losses.MeanSquaredError()
        self.trainable_variables = tf.Variable(
            tf.ones((2048, 2048), dtype=tf.float32),
            trainable=True,
        )

    def similarity_function(self, anchor_embeddings, positive_embeddings):
        M = self.trainable_variables
        X_i = anchor_embeddings
        X_j = positive_embeddings
        # distance(x, y) = sqrt( (x - y) @ M @ (x - y).T )
        similarity_value = tf.matmul(X_j, M, name='Tensor')
        similarity_value = tf.matmul(similarity_value, tf.transpose(X_i), name='Tensor')
        return similarity_value

    def train_step(self, anchor, positive):
        anchor_embeddings, positive_embeddings = anchor, positive
        # Calculate gradients
        with tf.GradientTape() as tape:
            # Calculate similarity between anchors and positives.
            similarities = self.similarity_function(anchor_embeddings, positive_embeddings)
            y_pred = similarities
            y_true = tf.zeros(1)
            print(y_true, y_pred)
            loss_value = self.loss_object(
                y_true=y_true,
                y_pred=y_pred,
            )
        gradients = tape.gradient(loss_value, self.trainable_variables)
        # Apply gradients via optimizer
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

metric_model = MetricLearningModel(lr=1e-3)
anchor, positive = tf.ones((1, 2048), dtype=tf.float32), tf.ones((1, 2048), dtype=tf.float32)
metric_model.train_step(anchor, positive)

The Python zip function expects iterable objects, like for example a list or a tuple.
In your calls to tape.gradient and optimizer.apply_gradients, you can put your Variable in a list to solve the issue:

with tf.GradientTape() as tape:
    ...
gradients = tape.gradient(loss_value, [self.trainable_variables])
# Apply gradients via optimizer
self.optimizer.apply_gradients(zip(gradients, [self.trainable_variables]))

tape.gradient respects the shape of the sources object passed to it, so if you feed it a list, you will get a list out of it. This is stated in the documentation:
Returns
    A list or nested structure of Tensors (or IndexedSlices, or None), one for each element in sources. Returned structure is the same as the structure of sources.
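To make that concrete, here is a minimal runnable sketch of the fix, reduced to a single training step with the shapes from the question (TF 2.x assumed):

import tensorflow as tf
from tensorflow import keras

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss_object = keras.losses.MeanSquaredError()
# The single trainable similarity matrix from the question.
M = tf.Variable(tf.ones((2048, 2048), dtype=tf.float32), trainable=True)

anchor = tf.ones((1, 2048), dtype=tf.float32)
positive = tf.ones((1, 2048), dtype=tf.float32)

with tf.GradientTape() as tape:
    similarity = tf.matmul(tf.matmul(positive, M), tf.transpose(anchor))
    loss_value = loss_object(y_true=tf.zeros(1), y_pred=similarity)

# Wrapping the lone Variable in a list gives zip() the iterables it expects.
gradients = tape.gradient(loss_value, [M])
optimizer.apply_gradients(zip(gradients, [M]))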

Related

TPU training error: "No registered 'Cumsum' OpKernel for XLA_TPU_JIT devices compatible with node {{node RaggedConcat/Cumsum"

I am trying to train a model with a custom train step on TPU. The training works fine on GPU, but not on TPU. I believe I am not using any tensorflow operation that is unsupported, according to this list [ https://cloud.google.com/tpu/docs/tensorflow-ops ], but the lists of both supported and unsupported functions are not exhaustive, and I am using functions not listed in either.
The error message mentions a ragged tensor, and there are only two lines of code where I use ragged tensors, both in train_step:
class CustomModel(tf.keras.Model):
    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x = data
        positiveLabel = tf.constant([1, 0], dtype=tf.int32)
        negativeLabel = tf.constant([0, 1], dtype=tf.int32)
        pLabelBatch = tf.reshape(tf.tile(positiveLabel, [tf.shape(x[0])[0]], name=None), [tf.shape(x[0])[0], 2])
        nLabelBatch = tf.reshape(tf.tile(negativeLabel, [tf.shape(x[0])[0]], name=None), [tf.shape(x[0])[0], 2])
        y = (pLabelBatch, pLabelBatch, nLabelBatch, nLabelBatch)
        batch_label = tf.reshape(y, (tf.size(y) // 2, 2), name=None)

        rs = tf.ragged.stack(x, axis=0)
        reg = rs.to_tensor()
        batch_input = tf.reshape(reg, (tf.shape(reg)[0] * tf.shape(reg)[1], tf.shape(reg)[2]))

        with tf.GradientTape() as tape:
            y_pred = self(batch_input, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(batch_label, y_pred, regularization_losses=self.losses)

        # Compute gradients
        _minimize(self.distribute_strategy, tape, self.optimizer, loss,
                  self.trainable_variables)
        # Update weights
        # self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
specifically
rs = tf.ragged.stack(x, axis=0)
reg = rs.to_tensor()
I couldn't find anything online about ragged tensors being supported or unsupported on TPU.
I am trying to figure out how to fully interpret the error message.
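Not a diagnosis of the exact error, but since the failing op appears under the ragged stack/to_tensor pair, a common workaround sketch is to build the dense batch with plain padding ops and skip tf.ragged entirely. The max_len argument and the [length, features] shape of each element are assumptions here:

import tensorflow as tf

def pad_and_stack(tensors, max_len):
    # Zero-pad each [length, features] tensor along its first axis to
    # max_len, then stack into a dense [batch, max_len, features] tensor.
    padded = [tf.pad(t, [[0, max_len - tf.shape(t)[0]], [0, 0]]) for t in tensors]
    return tf.stack(padded, axis=0)

# Example with two sequences of different lengths.
a = tf.ones((3, 4))
b = tf.ones((5, 4))
print(pad_and_stack([a, b], max_len=5).shape)  # (2, 5, 4)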

Tensorflow : IOU per class

I'm trying to use deeplab for semantic segmentation. I'd like to calculate IOU per class (IOU for person only) instead of mean IOU.
At L142 of
https://github.com/tensorflow/models/blob/master/research/deeplab/eval.py,
I tried to get the confusion matrix instead of mean IOU:
miou, cmat = tf.metrics.mean_iou(...)
metric_map['cmat'] = cmat
but it did not work.
I'd appreciate it if someone could suggest how to get around this.
You can use _streaming_confusion_matrix from tensorflow.python.ops.metrics_impl to get the confusion matrix.
Essentially it works the same way as other running metrics like mean_iou: calling it gives you two ops, a total confusion_matrix op and an update op that updates the confusion matrix cumulatively.
With the confusion matrix, you should then be able to compute the class-wise IoU, as sketched below.
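For that last step, here is a minimal sketch of computing per-class IoU from an accumulated confusion matrix (assuming rows are labels and columns are predictions, as in tf.math.confusion_matrix):

import tensorflow as tf

def per_class_iou(cm):
    # IoU_c = TP_c / (TP_c + FP_c + FN_c), computed for every class at once.
    cm = tf.cast(cm, tf.float32)
    true_positives = tf.linalg.diag_part(cm)
    false_positives = tf.reduce_sum(cm, axis=0) - true_positives
    false_negatives = tf.reduce_sum(cm, axis=1) - true_positives
    return tf.math.divide_no_nan(
        true_positives, true_positives + false_positives + false_negatives)

# Example: with 3 classes, the IoU of the "person" class at index 1 is:
cm = tf.constant([[5, 1, 0], [2, 7, 1], [0, 0, 4]])
person_iou = per_class_iou(cm)[1]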
I implemented a class-specific IoU metric for this very purpose, based on the MeanIoU class.

class ClassIoU(tf.keras.metrics.MeanIoU):
    """Computes the class-specific Intersection-Over-Union metric.

    IOU is defined as follows:
      IOU = true_positive / (true_positive + false_positive + false_negative).
    The predictions are accumulated in a confusion matrix, weighted by
    `sample_weight`, and the metric is then calculated from it.
    If `sample_weight` is `None`, weights default to 1.
    Use `sample_weight` of 0 to mask values.

    Args:
      class_idx: The index of the class of interest.
      one_hot: Indicates if the input is a one_hot vector as in
        CategoricalCrossentropy, or if the class indices are used as in
        SparseCategoricalCrossentropy or MeanIoU.
      num_classes: The possible number of labels the prediction task can have.
        This value must be provided, since a confusion matrix of dimension =
        [num_classes, num_classes] will be allocated.
      name: (Optional) string name of the metric instance.
      dtype: (Optional) data type of the metric result.
    """
    def __init__(self, class_idx, one_hot, num_classes, name=None, dtype=None):
        super().__init__(num_classes, name, dtype)
        self.one_hot = one_hot
        self.class_idx = class_idx

    def result(self):
        sum_over_row = tf.cast(
            tf.reduce_sum(self.total_cm, axis=0), dtype=self._dtype)
        sum_over_col = tf.cast(
            tf.reduce_sum(self.total_cm, axis=1), dtype=self._dtype)
        true_positives = tf.cast(
            tf.linalg.diag_part(self.total_cm), dtype=self._dtype)
        # sum_over_row + sum_over_col =
        #     2 * true_positives + false_positives + false_negatives.
        denominator = sum_over_row[self.class_idx] + sum_over_col[self.class_idx] \
            - true_positives[self.class_idx]
        # The IoU is only computed over classes that appear in the
        # label or prediction tensor. If the denominator is 0, we need to
        # ignore the class.
        num_valid_entries = tf.reduce_sum(
            tf.cast(tf.not_equal(denominator, 0), dtype=self._dtype))
        iou = tf.math.divide_no_nan(true_positives[self.class_idx], denominator)
        return tf.math.divide_no_nan(
            tf.reduce_sum(iou, name='mean_iou'), num_valid_entries)

    def update_state(self, y_true, y_pred, sample_weight=None):
        if self.one_hot:
            return super().update_state(
                tf.argmax(y_true, axis=-1), tf.argmax(y_pred, axis=-1), sample_weight)
        else:
            return super().update_state(y_true, y_pred, sample_weight)
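A quick usage sketch with made-up labels, for a 3-class problem with sparse integer targets:

# IoU of class 1 only; inputs are class indices, not one-hot vectors.
metric = ClassIoU(class_idx=1, one_hot=False, num_classes=3)
metric.update_state(y_true=tf.constant([0, 1, 1, 2]),
                    y_pred=tf.constant([0, 1, 2, 2]))
print(metric.result().numpy())  # 1 TP, 0 FP, 1 FN -> IoU = 0.5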

How to use sampled_softmax_loss function in tensorflow's model_with_buckets?

In tensorflow's model_with_buckets API function, there is a parameter named softmax_loss_function, whose signature is Function (labels, logits) -> loss-batch.
def model_with_buckets(encoder_inputs,
                       decoder_inputs,
                       targets,
                       weights,
                       buckets,
                       seq2seq,
                       softmax_loss_function=None,
                       per_example_loss=False,
                       name=None)
The traditional way of assigning softmax_loss_function is to use something like softmax_cross_entropy_with_logits:
def softmax_cross_entropy_with_logits(_sentinel=None,
                                      labels=None,
                                      logits=None,
                                      dim=-1,
                                      name=None)
While the sampled_softmax_loss function has the following definition:

def sampled_softmax_loss(weights,
                         biases,
                         labels,
                         inputs,
                         num_sampled,
                         num_classes,
                         num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         partition_strategy="mod",
                         name="sampled_softmax_loss"):
In the official tensorflow seq2seq example on translating English to French, sampled_softmax_loss is assigned to the model_with_buckets parameter through a wrapper function sampled_loss, as follows:

w = tf.get_variable("proj_w", [size, self.target_vocab_size], dtype=tf.float32)
w_t = tf.transpose(w)
b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=tf.float32)
output_projection = (w, b)

# maybe need Function (labels, logits)
def sampled_loss(labels, inputs):
    labels = tf.reshape(labels, [-1, 1])
    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, self.target_vocab_size)

softmax_loss_function = sampled_loss
This looks weird to me, and the code runs with a lot of errors, because inputs is not the same as logits. In my understanding, logits = inputs * weights + biases. So, what is the right way to use the sampled_softmax_loss function in tensorflow's model_with_buckets?
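For what it's worth, the wrapper is usually written with keyword arguments against the signature above, which also sidesteps the inputs/labels order problem (the positional order in the example matches an older TF API and is swapped relative to the signature shown). tf.nn.sampled_softmax_loss takes the pre-projection decoder state as inputs and applies the w_t/b projection internally over the sampled classes only, so explicit logits are never formed. A sketch, with size, num_samples, and target_vocab_size as placeholder values:

import tensorflow as tf

size = 512                 # decoder hidden size (placeholder)
target_vocab_size = 40000  # output vocabulary size (placeholder)
num_samples = 512          # number of sampled classes (placeholder)

w = tf.get_variable("proj_w", [size, target_vocab_size], dtype=tf.float32)
w_t = tf.transpose(w)
b = tf.get_variable("proj_b", [target_vocab_size], dtype=tf.float32)
output_projection = (w, b)

def sampled_loss(labels, logits):
    # Despite the (labels, logits) signature required by model_with_buckets,
    # the value passed in is the decoder's pre-projection output, so it is
    # forwarded to sampled_softmax_loss as `inputs`.
    labels = tf.reshape(labels, [-1, 1])
    return tf.nn.sampled_softmax_loss(weights=w_t,
                                      biases=b,
                                      labels=labels,
                                      inputs=logits,
                                      num_sampled=num_samples,
                                      num_classes=target_vocab_size)

softmax_loss_function = sampled_loss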

Using `softmax_cross_entropy_with_logits()` with `seq2seq.sequence_loss()`

I have a working RNN using the default softmax loss function for tf.contrib.seq2seq.sequence_loss() (which I'm assuming is tf.nn.softmax()), but would instead like to use tf.nn.softmax_cross_entropy_with_logits(). According to the seq2seq.sequence_loss documentation, one may use softmax_loss_function= to override the default loss function:
softmax_loss_function: Function (labels, logits) -> loss-batch to be
used instead of the standard softmax (the default if this is None).
Note that to avoid confusion, it is required for the function to
accept named arguments.
Here is my code that works:
from tensorflow.python.layers.core import Dense

# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    # Load the model inputs
    input_data, targets, keep_prob, lr, target_sequence_length, max_target_sequence_length, source_sequence_length \
        = get_model_inputs()

    # Create the training and inference logits
    training_decoder_output, inference_decoder_output = seq2seq_model(input_data,
                                                                      targets,
                                                                      lr,
                                                                      target_sequence_length,
                                                                      max_target_sequence_length,
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoding_embedding_size,
                                                                      decoding_embedding_size,
                                                                      rnn_size,
                                                                      num_layers,
                                                                      keep_prob)

    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')

    # Create the weights for sequence_loss
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)
        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    # Add variables to collection in order to load them up when retraining a saved graph
    tf.add_to_collection("cost", cost)
    tf.add_to_collection("train_op", train_op)
My attempt to change the loss function is as follows (I've only indicated the code that is different):
with tf.name_scope("optimization"):
    # One-hot encode targets and reshape to match logits, one row per batch_size per step
    y_one_hot = tf.one_hot(targets, len(target_letter_to_int))
    y_reshaped = tf.reshape(y_one_hot, [batch_size, len(target_letter_to_int), 30])
    # Loss function
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=training_logits, labels=y_reshaped)
    loss = tf.reduce_mean(loss)
    cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks, softmax_loss_function=loss)
The line cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks, softmax_loss_function=loss) is now giving me "TypeError: 'Tensor' object is not callable." This is one of the most opaque errors I've seen Tensorflow produce, and I haven't found much in the way of explanation on the internet. Any help would be appreciated.
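A note on the error itself: softmax_loss_function expects a callable with signature (labels, logits) -> loss, and here it is being passed an already-computed loss Tensor, which sequence_loss then tries to call. A sketch of a wrapper under that assumption, reusing target_letter_to_int from the code above (sequence_loss flattens logits to [batch * steps, vocab] and targets to [batch * steps] before invoking it):

def loss_fn(labels, logits):
    # labels arrive as flat class indices, so one-hot encode them to match
    # what softmax_cross_entropy_with_logits expects.
    return tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(labels, len(target_letter_to_int)),
        logits=logits)

cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks,
                                        softmax_loss_function=loss_fn)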

Compute gradient norm of each part of composite loss function

Assume I have the following loss function:
loss_a = tf.reduce_mean(my_loss_fn(model_output, targets))
loss_b = tf.reduce_mean(my_other_loss_fn(model_output, targets))
loss_final = loss_a + tf.multiply(alpha, loss_b)
To visualize the norm of the gradients w.r.t. loss_final, one could do this:
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
grads_and_vars = optimizer.compute_gradients(loss_final)
grads, _ = list(zip(*grads_and_vars))
norms = tf.global_norm(grads)
gradnorm_s = tf.summary.scalar('gradient norm', norms)
train_op = optimizer.apply_gradients(grads_and_vars, name='train_op')
However, I would like to plot the norm of the gradients w.r.t. loss_a and loss_b separately. How can I do this in the most efficient way? Do I have to call compute_gradients(..) on both loss_a and loss_b separately and then add those two gradients together before passing them to optimizer.apply_gradients(..)? I know that this would be mathematically correct due to the summation rule, but it just seems a bit cumbersome, and I also don't know how to implement the summation of the gradients correctly. Also, loss_final is rather simple, because it's just a summation. What if loss_final were more complicated, e.g. a division?
I'm using Tensorflow 0.12.
You are right that combining gradients could get messy. Instead, just compute the gradients of each of the losses as well as the final loss. Because tensorflow optimizes the directed acyclic graph (DAG) before compilation, this doesn't result in duplicated work.
For example:
import tensorflow as tf

with tf.name_scope('inputs'):
    W = tf.Variable(dtype=tf.float32, initial_value=tf.random_normal((4, 1), dtype=tf.float32), name='W')
    x = tf.random_uniform((6, 4), dtype=tf.float32, name='x')

with tf.name_scope('outputs'):
    y = tf.matmul(x, W, name='y')

def my_loss_fn(output, targets, name):
    return tf.reduce_mean(tf.abs(output - targets), name=name)

def my_other_loss_fn(output, targets, name):
    return tf.sqrt(tf.reduce_mean((output - targets) ** 2), name=name)

def get_tensors(loss_fn):
    loss = loss_fn(y, targets, 'loss')
    grads = tf.gradients(loss, W, name='gradients')
    norm = tf.norm(grads, name='norm')
    return loss, grads, norm

targets = tf.random_uniform((6, 1))
with tf.name_scope('a'):
    loss_a, grads_a, norm_a = get_tensors(my_loss_fn)
with tf.name_scope('b'):
    loss_b, grads_b, norm_b = get_tensors(my_other_loss_fn)

with tf.name_scope('combined'):
    loss = tf.add(loss_a, loss_b, name='loss')
    grad = tf.gradients(loss, W, name='gradients')

with tf.Session() as sess:
    tf.global_variables_initializer().run(session=sess)
    writer = tf.summary.FileWriter('./tensorboard_results', sess.graph)
    res = sess.run([norm_a, norm_b, grad])
    print(*res, sep='\n')
Edit: In response to your comment, you can check the DAG of a tensorflow model using tensorboard. I've updated the code above to store the graph.
Run tensorboard --logdir $PWD/tensorboard_results in a terminal and navigate to the URL printed on the command line (typically http://localhost:6006/). Then click on the GRAPH tab to view the DAG. You can recursively expand the tensors, ops, and namespaces to see subgraphs and inspect individual operations and their inputs.
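To also get the two separate norms into TensorBoard like the snippet in the question, one could attach a scalar summary to each (a sketch reusing the names from the code above):

norm_a_s = tf.summary.scalar('gradient_norm_a', norm_a)
norm_b_s = tf.summary.scalar('gradient_norm_b', norm_b)
merged = tf.summary.merge([norm_a_s, norm_b_s])
# Inside the session: writer.add_summary(sess.run(merged), global_step)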