Related
I am trying to learn a similarity matrix(M) between two image embeddings, A single instance of training is a pair of images - (anchor, positive). So ideally the model will return 0 distance for embeddings of similar images.
The problem is, when i declare the distance matrix(M) as a tf.Variable, it returns an error
on this line
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
TypeError: 'Variable' object is not iterable.
I think I should use a tensorflow datatype for M, that is iterable
Please tell me how I can fix this issue
import tensorflow as tf
from tensorflow import keras
# metric learning model
class MetricLearningModel:
def __init__(self, lr):
self.optimizer = keras.optimizers.Adam(lr=lr)
self.lr = lr
self.loss_object = keras.losses.MeanSquaredError()
self.trainable_variables = tf.Variable(
(tf.ones((2048, 2048), dtype=tf.float32)),
trainable=True
)
def similarity_function(self, anchor_embeddings, positive_embeddings):
M = self.trainable_variables
X_i = anchor_embeddings
X_j = positive_embeddings
similarity_value = tf.matmul(X_j, M, name='Tensor')
similarity_value = tf.matmul(similarity_value, tf.transpose(X_i), name='Tensor')
# distance(x,y) = sqrt( (x-y)#M#(x-y).T )
return similarity_value
def train_step(self, anchor, positive):
anchor_embeddings, positive_embeddings = anchor, positive
# Calculate gradients
with tf.GradientTape() as tape:
# Calculate similarity between anchors and positives.
similarities = self.similarity_function(anchor_embeddings, positive_embeddings)
y_pred = similarities
y_true = tf.zeros(1)
print(y_true, y_pred)
loss_value = self.loss_object(
y_pred=y_true,
y_true=y_pred,
)
gradients = tape.gradient(loss_value, self.trainable_variables)
# Apply gradients via optimizer
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
metric_model = MetricLearningModel(lr=1e-3)
anchor, positive = tf.ones((1, 2048), dtype=tf.float32), tf.ones((1, 2048), dtype=tf.float32)
metric_model.train_step(anchor, positive)
The python zip function expects iterable objects, like for example a list or a tuple.
In your calls to tape.gradient, or optimizer.apply_gradients, you can put your Variable in a list to solve the issue :
with tf.GradienTape() as tape:
gradients = tape.gradient(loss_value, [self.trainable_variables])
# Apply gradients via optimizer
self.optimizer.apply_gradients(zip(gradients, [self.trainable_variables]))
tape.gradient respects the shape of the sources object passed to compute the gradients of, so if you feed it with a list, you will get a list out of it. It is stated in the documentation:
Returns
a list or nested structure of Tensors (or IndexedSlices, or None), one for each element in sources. Returned structure is the same as the structure of sources.
I have created custom loss (Weighted Absolute error) in keras but implementation doesn't work - I get an error ValueError: No gradients provided for any variable: ['my_model/conv2d/kernel:0', 'my_model/conv2d/bias:0'].
I want to apply different weight for each pixel.
class WeightedMeanAbsoluteError(tf.keras.metrics.Metric):
def __init__(self, name='weighted_mean_absolute_error'):
super(WeightedMeanAbsoluteError, self).__init__(name=name)
self.wmae = self.add_weight(name='wmae', initializer='zeros')
def update_state(self, y_true, y_pred, loss_weights):
values = tf.math.abs(y_true - y_pred) * loss_weights
return self.wmae.assign_add(tf.reduce_sum(values))
def result(self):
return self.wmae
def reset_states(self):
# The state of the metric will be reset at the start of each epoch.
self.wmae.assign(0.)
loss_object = WeightedMeanAbsoluteError()
train_loss = WeightedMeanAbsoluteError()
I use the following code to implement a training step:
#tf.function
def train_step(input_images, output_images):
with tf.GradientTape() as tape:
# training=True is only needed if there are layers with different
# behavior during training versus inference (e.g. Dropout).
result_images = model(input_images, training=True)
loss = loss_object(output_images, result_images)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
Also my code works just fine if I use
loss_object = tf.keras.losses.MeanAbsoluteError()
train_loss = tf.keras.metrics.MeanAbsoluteError()
The best and simple way to minimize a weighted standard loss (such mae) is using the sample_weights parameter in fit method where we pass an array with the desired weight of each sample
X = np.random.uniform(0,1, (1000,50))
y = np.random.uniform(0,1, 1000)
W = np.random.randint(1,10, 1000)
inp = Input((50))
x = Dense(64, activation='relu')(inp)
out = Dense(10)(x)
model = Model(inp, out)
model.compile('adam','mae')
model.fit(X,y, epochs=100, sample_weights=W)
trying here to make an eager exec model work with LR decay, but no success. It seems to be a bug, since it appear that the learning rate decay tensor does not get updated. If I am missing something can you land a hand here. Thanks.
The code bellow is learning some word embeddings. However, the learning rate decay section does not work at all.
class Word2Vec(tf.keras.Model):
def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
self.vocab_size = vocab_size
self.num_sampled = num_sampled
self.embed_matrix = tfe.Variable(tf.random_uniform(
[vocab_size, embed_size]), name="embedding_matrix")
self.nce_weight = tfe.Variable(tf.truncated_normal(
[vocab_size, embed_size],
stddev=1.0 / (embed_size ** 0.5)), name="weights")
self.nce_bias = tfe.Variable(tf.zeros([vocab_size]), name="biases")
def compute_loss(self, center_words, target_words):
"""Computes the forward pass of word2vec with the NCE loss."""
embed = tf.nn.embedding_lookup(self.embed_matrix, center_words)
loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weight,
biases=self.nce_bias,
labels=target_words,
inputs=embed,
num_sampled=self.num_sampled,
num_classes=self.vocab_size))
return loss
def gen():
yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES,
VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW,
VISUAL_FLD)
def main():
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),
(tf.TensorShape([BATCH_SIZE]),
tf.TensorShape([BATCH_SIZE, 1])))
global_step = tf.train.get_or_create_global_step()
starter_learning_rate = 1.0
end_learning_rate = 0.01
decay_steps = 1000
learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
decay_steps, end_learning_rate,
power=0.5)
train_writer = tf.contrib.summary.create_file_writer('./checkpoints')
train_writer.set_as_default()
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)
grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)
total_loss = 0.0 # for average loss in the last SKIP_STEP steps
checkpoint_dir = "./checkpoints/"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
root = tfe.Checkpoint(optimizer=optimizer,
model=model,
optimizer_step=tf.train.get_or_create_global_step())
while global_step < NUM_TRAIN_STEPS:
for center_words, target_words in tfe.Iterator(dataset):
with tf.contrib.summary.record_summaries_every_n_global_steps(100):
if global_step >= NUM_TRAIN_STEPS:
break
loss_batch, grads = grad_fn(center_words, target_words)
tf.contrib.summary.scalar('loss', loss_batch)
tf.contrib.summary.scalar('learning_rate', learning_rate)
# print(grads)
# print(len(grads))
total_loss += loss_batch
optimizer.apply_gradients(grads, global_step)
if (global_step.numpy() + 1) % SKIP_STEP == 0:
print('Average loss at step {}: {:5.1f}'.format(
global_step.numpy(), total_loss / SKIP_STEP))
total_loss = 0.0
root.save(file_prefix=checkpoint_prefix)
if __name__ == '__main__':
main()
Note that when eager execution is enabled, the tf.Tensor objects represent concrete values (as opposed to symbolic handles of computation that will occur on Session.run() calls).
As a result, in your code snippet above, the line:
learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
decay_steps, end_learning_rate,
power=0.5)
is computing the decayed value once, using the global_step at the time it was invoked, and when the optimizer is being created with:
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
it is being given a fixed learning rate.
To decay the learning rate, you'd want to invoke tf.train.polynomial_decay repeatedly (with updated values for global_step). One way to do this would be to replicate what is done in the RNN example, using something like this:
starter_learning_rate = 1.0
learning_rate = tfe.Variable(starter_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
while global_step < NUM_TRAIN_STEPS:
# ....
learning_rate.assign(tf.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, end_learning_rate, power=0.5))
This way you've captured the learning_rate in a variable that can be updated. Furthermore, it's simple to include the current learning_rate in the checkpoint as well (by including it when creating the Checkpoint object).
Hope that helps.
I have a working RNN using the default softmax loss function for tf.contrib.seq2seq.sequence_loss() (which I'm assuming is tf.nn.softmax()) but would instead like to use tf.nn.softmax_cross_entropy_with_logits(). According to the seq2seq.sequence_loss documentation, one may use softmax_loss_function= to override the default loss function:
softmax_loss_function: Function (labels, logits) -> loss-batch to be
used instead of the standard softmax (the default if this is None).
Note that to avoid confusion, it is required for the function to
accept named arguments.
Here is my code that works:
from tensorflow.python.layers.core import Dense
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
# Load the model inputs
input_data, targets, keep_prob, lr, target_sequence_length, max_target_sequence_length, source_sequence_length \
= get_model_inputs()
# Create the training and inference logits
training_decoder_output, inference_decoder_output = seq2seq_model(input_data,
targets,
lr,
target_sequence_length,
max_target_sequence_length,
source_sequence_length,
len(source_letter_to_int),
len(target_letter_to_int),
encoding_embedding_size,
decoding_embedding_size,
rnn_size,
num_layers,
keep_prob)
# Create tensors for the training logits and inference logits
training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
inference_logits = tf.identity(inference_decoder_output.sample_id, name='predictions')
# Create the weights for sequence_loss
masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
with tf.name_scope("optimization"):
# Loss function
cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
# Optimizer
optimizer = tf.train.AdamOptimizer(lr)
# Gradient Clipping
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)
# Add variables to collection in order to load them up when retraining a saved graph
tf.add_to_collection("cost", cost)
tf.add_to_collection("train_op", train_op)
My attempt to change the loss function is as follows (I've only indicated the code that is different):
with tf.name_scope("optimization"):
# One-hot encode targets and reshape to match logits, one row per batch_size per step
y_one_hot = tf.one_hot(targets, len(target_letter_to_int))
y_reshaped = tf.reshape(y_one_hot, [batch_size, len(target_letter_to_int), 30])
# Loss function
loss = tf.nn.softmax_cross_entropy_with_logits(logits=training_logits, labels=y_reshaped)
loss = tf.reduce_mean(loss)
cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks, softmax_loss_function=loss)
The line cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks, softmax_loss_function=loss) is now giving me "TypeError: 'Tensor' object is not callable." This is one of the most opaque errors I've seen Tensorflow produce and I haven't found much of anything in the way of explanation on the internet. Any help would be appreciated.
I had tried several versions of batch_normalization in tensorflow, but none of them worked! The results were all incorrect when I set batch_size = 1 at inference time.
Version 1: directly use the official version in tensorflow.contrib
from tensorflow.contrib.layers.python.layers.layers import batch_norm
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
Version 2: from How could I use Batch Normalization in TensorFlow?
def batch_norm_layer(x, train_phase, scope_bn='bn'):
bn_train = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
updates_collections=None,
is_training=True,
reuse=None, # is this right?
trainable=True,
scope=scope_bn)
bn_inference = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
updates_collections=None,
is_training=False,
reuse=True, # is this right?
trainable=True,
scope=scope_bn)
z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
return z
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training is a placeholder at training time is True and False at inference time.
version 3: from slim https://github.com/tensorflow/models/blob/master/inception/inception/slim/ops.py
def batch_norm_layer(inputs,
is_training=True,
scope='bn'):
decay=0.999
epsilon=0.001
inputs_shape = inputs.get_shape()
with tf.variable_scope(scope) as t_scope:
axis = list(range(len(inputs_shape) - 1))
params_shape = inputs_shape[-1:]
# Allocate parameters for the beta and gamma of the normalization.
beta, gamma = None, None
beta = tf.Variable(tf.zeros_initializer(params_shape),
name='beta',
trainable=True)
gamma = tf.Variable(tf.ones_initializer(params_shape),
name='gamma',
trainable=True)
moving_mean = tf.Variable(tf.zeros_initializer(params_shape),
name='moving_mean',
trainable=False)
moving_variance = tf.Variable(tf.ones_initializer(params_shape),
name='moving_variance',
trainable=False)
if is_training:
# Calculate the moments based on the individual batch.
mean, variance = tf.nn.moments(inputs, axis)
update_moving_mean = moving_averages.assign_moving_average(
moving_mean, mean, decay)
update_moving_variance = moving_averages.assign_moving_average(
moving_variance, variance, decay)
else:
# Just use the moving_mean and moving_variance.
mean = moving_mean
variance = moving_variance
# Normalize the activations.
outputs = tf.nn.batch_normalization(
inputs, mean, variance, beta, gamma, epsilon)
outputs.set_shape(inputs.get_shape())
return outputs
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
version 4: like version3, but add tf.control_dependencies
def batch_norm_layer(inputs,
decay=0.999,
center=True,
scale=True,
epsilon=0.001,
moving_vars='moving_vars',
activation=None,
is_training=True,
trainable=True,
restore=True,
scope='bn',
reuse=None):
inputs_shape = inputs.get_shape()
with tf.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse):
axis = list(range(len(inputs_shape) - 1))
params_shape = inputs_shape[-1:]
# Allocate parameters for the beta and gamma of the normalization.
beta = tf.Variable(tf.zeros(params_shape), name='beta')
gamma = tf.Variable(tf.ones(params_shape), name='gamma')
# Create moving_mean and moving_variance add them to
# GraphKeys.MOVING_AVERAGE_VARIABLES collections.
moving_mean = tf.Variable(tf.zeros(params_shape), name='moving_mean',
trainable=False)
moving_variance = tf.Variable(tf.ones(params_shape), name='moving_variance',
trainable=False)
control_inputs = []
if is_training:
# Calculate the moments based on the individual batch.
mean, variance = tf.nn.moments(inputs, axis)
update_moving_mean = moving_averages.assign_moving_average(
moving_mean, mean, decay)
update_moving_variance = moving_averages.assign_moving_average(
moving_variance, variance, decay)
control_inputs = [update_moving_mean, update_moving_variance]
else:
# Just use the moving_mean and moving_variance.
mean = moving_mean
variance = moving_variance
# Normalize the activations.
with tf.control_dependencies(control_inputs):
return tf.nn.batch_normalization(
inputs, mean, variance, beta, gamma, epsilon)
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
The 4 versions of Batch_normalization are all not correct. So, how to use batch normalization correctly?
Another strange phenomenon is if I set batch_norm_layer to null like this, the inference result are all same.
def batch_norm_layer(inputs, is_training):
return inputs
I have tested that the following simplified implementation of batch normalization gives the same result as tf.contrib.layers.batch_norm as long as the setting is the same.
def initialize_batch_norm(scope, depth):
with tf.variable_scope(scope) as bnscope:
gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0))
beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0))
moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
bnscope.reuse_variables()
def BatchNorm_layer(x, scope, train, epsilon=0.001, decay=.99):
# Perform a batch normalization after a conv layer or a fc layer
# gamma: a scale factor
# beta: an offset
# epsilon: the variance epsilon - a small float number to avoid dividing by 0
with tf.variable_scope(scope, reuse=True):
with tf.variable_scope('BatchNorm', reuse=True) as bnscope:
gamma, beta = tf.get_variable("gamma"), tf.get_variable("beta")
moving_avg, moving_var = tf.get_variable("moving_avg"), tf.get_variable("moving_var")
shape = x.get_shape().as_list()
control_inputs = []
if train:
avg, var = tf.nn.moments(x, range(len(shape)-1))
update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
control_inputs = [update_moving_avg, update_moving_var]
else:
avg = moving_avg
var = moving_var
with tf.control_dependencies(control_inputs):
output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
return output
The main tips with using the official implementation of batch normalization in tf.contrib.layers.batch_norm are: (1) set is_training=True for training time and is_training=False for validation and testing time; (2) set updates_collections=None to make sure that moving_variance and moving_mean are updated in place; (3) be aware and careful with the scope setting; (4) set decay to be a smaller value (decay=0.9 or decay=0.99) than default value (default is 0.999) if your dataset is small or your total training updates/steps are not that large.
I found the Zhongyu Kuang's code really useful, but I stuck on how to dynamically switch between train and test ops, i.e. how to move from a python boolean is_training to a tensorflow boolean placeholder is_training. I need this functionality to be able to test the network on the validation set during the training.
Starting from his code and inspired by this, I wrote the following code:
def batch_norm(x, scope, is_training, epsilon=0.001, decay=0.99):
"""
Returns a batch normalization layer that automatically switch between train and test phases based on the
tensor is_training
Args:
x: input tensor
scope: scope name
is_training: boolean tensor or variable
epsilon: epsilon parameter - see batch_norm_layer
decay: epsilon parameter - see batch_norm_layer
Returns:
The correct batch normalization layer based on the value of is_training
"""
assert isinstance(is_training, (ops.Tensor, variables.Variable)) and is_training.dtype == tf.bool
return tf.cond(
is_training,
lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=True, reuse=None),
lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=False, reuse=True),
)
def batch_norm_layer(x, scope, is_training, epsilon=0.001, decay=0.99, reuse=None):
"""
Performs a batch normalization layer
Args:
x: input tensor
scope: scope name
is_training: python boolean value
epsilon: the variance epsilon - a small float number to avoid dividing by 0
decay: the moving average decay
Returns:
The ops of a batch normalization layer
"""
with tf.variable_scope(scope, reuse=reuse):
shape = x.get_shape().as_list()
# gamma: a trainable scale factor
gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0), trainable=True)
# beta: a trainable shift value
beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0), trainable=True)
moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
if is_training:
# tf.nn.moments == Calculate the mean and the variance of the tensor x
avg, var = tf.nn.moments(x, range(len(shape)-1))
update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
control_inputs = [update_moving_avg, update_moving_var]
else:
avg = moving_avg
var = moving_var
control_inputs = []
with tf.control_dependencies(control_inputs):
output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
return output
Then I use the batch_norm layer in this way:
fc1_weights = tf.Variable(...)
fc1 = tf.matmul(x, fc1_weights)
fc1 = batch_norm(fc1, 'fc1_bn', is_training=is_training)
fc1 = tf.nn.relu(fc1)
Where is_training is a boolean placeholder. Note that the bias addition is not needed because is replaced by the beta parameter as explained in the Batch Normalization paper.
During execution:
# Training phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: True})
# Testing phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: False})