zip object for opt.apply_gradients - tensorflow

There is a program including an optimization function, which has the following code segment to compute gradients:
if hypes['clip_norm'] > 0:
    grads, tvars = zip(*grads_and_vars)
    clip_norm = hypes["clip_norm"]
    clipped_grads, norm = tf.clip_by_global_norm(grads, clip_norm)
    grads_and_vars = zip(clipped_grads, tvars)
print('grads_and_vars ', grads_and_vars)
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = opt.apply_gradients(grads_and_vars,
                                   global_step=global_step)
However, running the program raises the following error
File "/home/FCN/kittiseg/hypes/../optimizer/generic_optimizer.py", line 92, in training
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
File "tensorflow/tf_0.12/lib/python3.4/site-packages/tensorflow/python/training/optimizer.py", line 370, in apply_gradients
raise ValueError("No variables provided.")
ValueError: No variables provided.
I dug into the code and think it is caused by the variable grads_and_vars. I printed it out, and it is just grads_and_vars <zip object at 0x2b0d6c27e348>. But I don't know how to analyze it, or what causes
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
to fail.
This is the original training function
def training(hypes, loss, global_step, learning_rate, opt=None):
    """Sets up the training Ops.

    Creates a summarizer to track the loss over time in TensorBoard.
    Creates an optimizer and applies the gradients to all trainable variables.
    The Op returned by this function is what must be passed to the
    `sess.run()` call to cause the model to train.

    Args:
      loss: Loss tensor, from loss().
      global_step: Integer Variable counting the number of training steps
        processed.
      learning_rate: The learning rate to use for gradient descent.

    Returns:
      train_op: The Op for training.
    """
    # Add a scalar summary for the snapshot loss.
    sol = hypes["solver"]
    hypes['tensors'] = {}
    hypes['tensors']['global_step'] = global_step
    total_loss = loss['total_loss']
    with tf.name_scope('training'):

        if opt is None:
            if sol['opt'] == 'RMS':
                opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                decay=0.9,
                                                epsilon=sol['epsilon'])
            elif sol['opt'] == 'Adam':
                opt = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                             epsilon=sol['adam_eps'])
            elif sol['opt'] == 'SGD':
                lr = learning_rate
                opt = tf.train.GradientDescentOptimizer(learning_rate=lr)
            else:
                raise ValueError('Unrecognized opt type')

        hypes['opt'] = opt

        grads_and_vars = opt.compute_gradients(total_loss)

        if hypes['clip_norm'] > 0:
            grads, tvars = zip(*grads_and_vars)
            clip_norm = hypes["clip_norm"]
            clipped_grads, norm = tf.clip_by_global_norm(grads, clip_norm)
            grads_and_vars = zip(clipped_grads, tvars)

        train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = opt.apply_gradients(grads_and_vars,
                                           global_step=global_step)

    return train_op

Seems like a bug in the gradient clipping section. I had the same problem, did some research on how to do it properly (see the source below), and it seems to work now.
Replace the section
grads, tvars = zip(*grads_and_vars)
clip_norm = hypes["clip_norm"]
clipped_grads, norm = tf.clip_by_global_norm(grads, clip_norm)
grads_and_vars = zip(clipped_grads, tvars)
with
clip_norm = hypes["clip_norm"]
grads_and_vars = [(tf.clip_by_value(grad, -clip_norm, clip_norm), var)
                  for grad, var in grads_and_vars]
and it should work.
source: How to effectively apply gradient clipping in tensor flow?

Note that tf.clip_by_value has a different effect on the gradient values than tf.clip_by_global_norm.
tf.clip_by_value clips each gradient value independently into the clip range, while tf.clip_by_global_norm computes the global norm of all gradient values and, if it exceeds the clip norm, rescales every value by the same factor so that the global norm equals the clip norm, preserving the proportions between the gradients.
To illustrate the difference between the two functions, let's say we have
original gradients = [2.0, 1.0, 2.0]
tf.clip_by_value(gradients, -1.0, 1.0) will turn the gradients into [1.0, 1.0, 1.0]
tf.clip_by_global_norm(gradients, 1.0) will turn the gradients into roughly [0.67, 0.33, 0.67], since the global norm is 3.0 and every value is scaled by 1/3
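To make the arithmetic concrete, here is a quick NumPy check (a sketch; it simply applies the scaling t * clip_norm / max(global_norm, clip_norm) described in the tf.clip_by_global_norm documentation):
import numpy as np

grads = np.array([2.0, 1.0, 2.0])
clip_norm = 1.0

# element-wise clipping (tf.clip_by_value)
print(np.clip(grads, -clip_norm, clip_norm))            # [1. 1. 1.]

# global-norm clipping (tf.clip_by_global_norm)
global_norm = np.sqrt(np.sum(grads ** 2))               # 3.0
print(grads * clip_norm / max(global_norm, clip_norm))  # approx. [0.67 0.33 0.67]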
To answer the original question: what works for me is converting the zip object to a list, as below:
grads, tvars = zip(*grads_and_vars)
(clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
grads_and_vars = list(zip(clipped_grads, tvars))
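For completeness, here is why the list conversion matters (a plain-Python sketch, no TensorFlow needed): in Python 3, zip() returns a one-shot iterator, and the training() function above passes grads_and_vars to opt.apply_gradients twice, so the first call exhausts the iterator and the second call sees an empty sequence, hence ValueError: No variables provided.
pairs = zip(['grad_w', 'grad_b'], ['w', 'b'])
print(list(pairs))   # first consumer: [('grad_w', 'w'), ('grad_b', 'b')]
print(list(pairs))   # second consumer: [] -- the iterator is already exhausted

pairs = list(zip(['grad_w', 'grad_b'], ['w', 'b']))
print(list(pairs))   # a list can be iterated as often as needed
print(list(pairs))   # same pairs again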

Related

weights is not changed after applying weights in keras with tensorflow GradientTape

The following is the code:
with tf.GradientTape() as tape:
    predictions = self.p_model(obs)
    indices = tf.random.categorical(predictions, 1, dtype=tf.int32)
    indices = tf.squeeze(indices)
    # print('indices:', indices)
    row_indices = tf.range(tf.shape(indices)[0], dtype=tf.int32)
    full_indices = tf.stack([row_indices, indices], axis=1)
    act_val = tf.gather_nd(predictions, full_indices)
    target_preds1 = self.target_model(obs)
    target_preds1 = tf.gather_nd(target_preds1, full_indices)
    target_preds2 = self.target_model2(obs)
    target_preds2 = tf.gather_nd(target_preds2, full_indices)
    target_predict = tf.math.minimum(target_preds1, target_preds2)
    loss_value = self.alpha * tf.math.log(act_val) - target_predict
    loss_value = tf.reduce_mean(loss_value)
grads = tape.gradient(loss_value, self.p_model.trainable_weights)
self.p_optimizer.apply_gradients(
    zip(grads, self.p_model.trainable_weights))
I checked the grads tensor; its values are not zero. However, trainable_weights is not changed after applying the gradients. The Keras version is 2.3.1 and the TensorFlow version is 1.15.0.
When I enable eager execution, it works. Is this caused by the Keras or TensorFlow version?
I solved this problem. self.p_optimizer.apply_gradients returns an update op. Get the TF session from keras.backend.get_session and run the update op:
grads = tape.gradient(final_loss, self.p_model.trainable_weights)
update_op = self.optimizer.apply_gradients(
    zip(grads, self.p_model.trainable_weights))
keras.backend.get_session().run(update_op)
It works, though running the update op by hand should normally not be necessary.
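For context, here is a minimal graph-mode sketch (assuming TF 1.x without eager execution; tf.train.GradientDescentOptimizer and a hand-made gradient are used purely for illustration, this is not the questioner's model). It shows that apply_gradients merely builds an op, and the variable only changes once that op is run in a session:
import tensorflow as tf

v = tf.Variable(1.0)
opt = tf.train.GradientDescentOptimizer(0.1)
update_op = opt.apply_gradients([(tf.constant(1.0), v)])  # gradient of 1.0 for v

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(v))   # 1.0 -- building the op changed nothing
    sess.run(update_op)  # actually applies v <- v - 0.1 * 1.0
    print(sess.run(v))   # 0.9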

Tensorflow Eager Execution does not work with Learning Rate decay

I am trying to make an eager-execution model work with learning rate decay, but with no success. It seems to be a bug, since it appears that the learning rate decay tensor does not get updated. If I am missing something, can you lend a hand here? Thanks.
The code below learns some word embeddings. However, the learning rate decay section does not work at all.
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
        self.vocab_size = vocab_size
        self.num_sampled = num_sampled
        self.embed_matrix = tfe.Variable(tf.random_uniform(
            [vocab_size, embed_size]), name="embedding_matrix")
        self.nce_weight = tfe.Variable(tf.truncated_normal(
            [vocab_size, embed_size],
            stddev=1.0 / (embed_size ** 0.5)), name="weights")
        self.nce_bias = tfe.Variable(tf.zeros([vocab_size]), name="biases")

    def compute_loss(self, center_words, target_words):
        """Computes the forward pass of word2vec with the NCE loss."""
        embed = tf.nn.embedding_lookup(self.embed_matrix, center_words)
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weight,
                                             biases=self.nce_bias,
                                             labels=target_words,
                                             inputs=embed,
                                             num_sampled=self.num_sampled,
                                             num_classes=self.vocab_size))
        return loss


def gen():
    yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES,
                                        VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW,
                                        VISUAL_FLD)


def main():
    dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),
                                             (tf.TensorShape([BATCH_SIZE]),
                                              tf.TensorShape([BATCH_SIZE, 1])))

    global_step = tf.train.get_or_create_global_step()
    starter_learning_rate = 1.0
    end_learning_rate = 0.01
    decay_steps = 1000
    learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
                                              decay_steps, end_learning_rate,
                                              power=0.5)

    train_writer = tf.contrib.summary.create_file_writer('./checkpoints')
    train_writer.set_as_default()

    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
    model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)
    grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)
    total_loss = 0.0  # for average loss in the last SKIP_STEP steps

    checkpoint_dir = "./checkpoints/"
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    root = tfe.Checkpoint(optimizer=optimizer,
                          model=model,
                          optimizer_step=tf.train.get_or_create_global_step())

    while global_step < NUM_TRAIN_STEPS:
        for center_words, target_words in tfe.Iterator(dataset):
            with tf.contrib.summary.record_summaries_every_n_global_steps(100):
                if global_step >= NUM_TRAIN_STEPS:
                    break
                loss_batch, grads = grad_fn(center_words, target_words)
                tf.contrib.summary.scalar('loss', loss_batch)
                tf.contrib.summary.scalar('learning_rate', learning_rate)
                # print(grads)
                # print(len(grads))
                total_loss += loss_batch
                optimizer.apply_gradients(grads, global_step)
                if (global_step.numpy() + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(
                        global_step.numpy(), total_loss / SKIP_STEP))
                    total_loss = 0.0

    root.save(file_prefix=checkpoint_prefix)


if __name__ == '__main__':
    main()
Note that when eager execution is enabled, the tf.Tensor objects represent concrete values (as opposed to symbolic handles of computation that will occur on Session.run() calls).
As a result, in your code snippet above, the line:
learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
                                          decay_steps, end_learning_rate,
                                          power=0.5)
is computing the decayed value once, using the global_step at the time it was invoked, and when the optimizer is being created with:
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
it is being given a fixed learning rate.
To decay the learning rate, you'd want to invoke tf.train.polynomial_decay repeatedly (with updated values for global_step). One way to do this would be to replicate what is done in the RNN example, using something like this:
starter_learning_rate = 1.0
learning_rate = tfe.Variable(starter_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
while global_step < NUM_TRAIN_STEPS:
    # ....
    learning_rate.assign(tf.train.polynomial_decay(starter_learning_rate, global_step,
                                                   decay_steps, end_learning_rate, power=0.5))
This way you've captured the learning_rate in a variable that can be updated. Furthermore, it's simple to include the current learning_rate in the checkpoint as well (by including it when creating the Checkpoint object).
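As a concrete sketch of that pattern (this just combines the snippet above with the loop from the question; dataset, grad_fn, global_step and the decay constants are assumed to be defined exactly as in the question's main()):
learning_rate = tfe.Variable(starter_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)

while global_step < NUM_TRAIN_STEPS:
    for center_words, target_words in tfe.Iterator(dataset):
        # Recompute the decayed value every step and write it into the variable;
        # the optimizer holds a reference to the variable, so it sees the update.
        learning_rate.assign(
            tf.train.polynomial_decay(starter_learning_rate, global_step,
                                      decay_steps, end_learning_rate, power=0.5))
        loss_batch, grads = grad_fn(center_words, target_words)
        optimizer.apply_gradients(grads, global_step)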
Hope that helps.

how to restore the learning rate in TF from previously saved checkpoint ?

I stopped training at some point and saved the checkpoint, meta files, etc.
Now that I want to resume training, I want to start with the last running learning rate of the optimizer. Can you provide an example of doing so?
For those coming here (like me) wondering whether the last learning rate is automatically restored: tf.train.exponential_decay doesn't add any Variables to the graph, it only adds the operations necessary to derive the correct current learning rate value given a certain global_step value. This way, you only need to checkpoint the global_step value (which is done by default normally) and, assuming you keep the same initial learning rate, decay steps and decay factor, you'll automatically pick up training where you left it, with the correct learning rate value.
Inspecting the checkpoint won't show any learning_rate variable (or similar), simply because there is no need for any.
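A small sketch of why that is sufficient (this just restates the formula from the tf.train.exponential_decay documentation, decayed_lr = learning_rate * decay_rate ** (global_step / decay_steps); the helper below is illustrative, not a TensorFlow API):
def decayed_lr(initial_lr, decay_rate, global_step, decay_steps, staircase=False):
    # A pure function of global_step and fixed hyperparameters -- nothing else
    # needs to live in the checkpoint for the learning rate to be recovered.
    exponent = global_step // decay_steps if staircase else global_step / decay_steps
    return initial_lr * decay_rate ** exponent

# With the settings used in the example below (start 0.15, decay 0.96 every 10 steps),
# restoring global_step alone reproduces the exact learning rate you stopped at:
print(decayed_lr(0.15, 0.96, global_step=250, decay_steps=10))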
This example code learns to add two numbers:
import tensorflow as tf
import numpy as np
import os

save_ckpt_dir = './add_ckpt'
ckpt_filename = 'add.ckpt'
save_ckpt_path = os.path.join(save_ckpt_dir, ckpt_filename)

if not os.path.isdir(save_ckpt_dir):
    os.mkdir(save_ckpt_dir)

if any(fname.startswith("add.ckpt") for fname in os.listdir(save_ckpt_dir)):  # prefer to load pre-trained net
    load_ckpt_path = save_ckpt_path
else:
    load_ckpt_path = None  # train from scratch


def add_layer(inputs, in_size, out_size, activation_fn=None):
    Weights = tf.Variable(tf.ones([in_size, out_size]), name='Weights')
    biases = tf.Variable(tf.zeros([1, out_size]), name='biases')
    Wx_plus_b = tf.add(tf.matmul(inputs, Weights), biases)
    if activation_fn is None:
        layer_output = Wx_plus_b
    else:
        layer_output = activation_fn(Wx_plus_b)
    return layer_output


def produce_batch(batch_size=256):
    """Loads a single batch of data.

    Args:
      batch_size: The number of exercises in the batch.

    Returns:
      x : column vector of numbers
      y : another column of numbers
      xy_sum : the sum of the columns
    """
    x = np.random.random(size=[batch_size, 1]) * 10
    y = np.random.random(size=[batch_size, 1]) * 10
    xy_sum = x + y
    return x, y, xy_sum


with tf.name_scope("inputs"):
    xs = tf.placeholder(tf.float32, [None, 1])
    ys = tf.placeholder(tf.float32, [None, 1])

with tf.name_scope("correct_labels"):
    xysums = tf.placeholder(tf.float32, [None, 1])

with tf.name_scope("step_and_learning_rate"):
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96)  # start lr=0.15, decay every 10 steps with a base of 0.96

with tf.name_scope("graph_body"):
    prediction = add_layer(tf.concat([xs, ys], 1), 2, 1, activation_fn=None)

with tf.name_scope("loss_and_train"):
    # the error between prediction and real data
    loss = tf.reduce_mean(tf.reduce_sum(tf.square(xysums - prediction), reduction_indices=[1]))
    # Passing global_step to minimize() will increment it at each step.
    train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)

with tf.name_scope("init_load_save"):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    if load_ckpt_path:
        saver.restore(sess, load_ckpt_path)
    for i in range(1000):
        x, y, xy_sum = produce_batch(256)
        _, global_step_np, loss_np, lr_np = sess.run([train_step, global_step, loss, lr],
                                                     feed_dict={xs: x, ys: y, xysums: xy_sum})
        if global_step_np % 100 == 0:
            print("global step: {}, loss: {}, learning rate: {}".format(global_step_np, loss_np, lr_np))

    saver.save(sess, save_ckpt_path)
If you run it a few times, you will see the learning rate decrease. It also saves the global step. The trick is here:
with tf.name_scope("step_and_learning_rate"):
global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.15, global_step, 10, 0.96) # start lr=0.15, decay every 10 steps with a base of 0.96
...
train_step = tf.train.AdamOptimizer(lr).minimize(loss, global_step=global_step)
By default, saver.save will save all saveable objects (which here includes the global step, and with it the effective learning rate). However, if tf.train.Saver is given a var_list, saver.save will only save the variables included in var_list:
saver = tf.train.Saver(var_list = ..list of vars to save..)
sources:
https://www.tensorflow.org/api_docs/python/tf/train/exponential_decay
https://stats.stackexchange.com/questions/200063/tensorflow-adam-optimizer-with-exponential-decay
https://www.tensorflow.org/api_docs/python/tf/train/Saver (see "saveable objects")

Compute gradient norm of each part of composite loss function

Assume I have the following loss function:
loss_a = tf.reduce_mean(my_loss_fn(model_output, targets))
loss_b = tf.reduce_mean(my_other_loss_fn(model_output, targets))
loss_final = loss_a + tf.multiply(alpha, loss_b)
To visualize the norm of the gradients w.r.t to loss_final one could do this:
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
grads_and_vars = optimizer.compute_gradients(loss_final)
grads, _ = list(zip(*grads_and_vars))
norms = tf.global_norm(grads)
gradnorm_s = tf.summary.scalar('gradient norm', norms)
train_op = optimizer.apply_gradients(grads_and_vars, name='train_op')
However, I would like to plot the norm of the gradients w.r.t to loss_a and loss_b separately. How can I do this in the most efficient way? Do I have to call compute_gradients(..) on both loss_a and loss_b separately and then add those two gradients together before passing them to optimizer.apply_gradients(..)? I know that this would mathematically be correct due to the summation rule, but it just seems a bit cumbersome and I also don't know how you would implement the summation of the gradients correctly. Also, loss_final is rather simple, because it's just a summation. What if loss_final was more complicated, e.g. a division?
I'm using Tensorflow 0.12.
You are right that combining gradients could get messy. Instead just compute the gradients of each of the losses as well as the final loss. Because tensorflow optimizes the directed acyclic graph (DAG) before compilation, this doesn't result in duplication of work.
For example:
import tensorflow as tf

with tf.name_scope('inputs'):
    W = tf.Variable(dtype=tf.float32,
                    initial_value=tf.random_normal((4, 1), dtype=tf.float32), name='W')
    x = tf.random_uniform((6, 4), dtype=tf.float32, name='x')

with tf.name_scope('outputs'):
    y = tf.matmul(x, W, name='y')


def my_loss_fn(output, targets, name):
    return tf.reduce_mean(tf.abs(output - targets), name=name)


def my_other_loss_fn(output, targets, name):
    return tf.sqrt(tf.reduce_mean((output - targets) ** 2), name=name)


def get_tensors(loss_fn):
    loss = loss_fn(y, targets, 'loss')
    grads = tf.gradients(loss, W, name='gradients')
    norm = tf.norm(grads, name='norm')
    return loss, grads, norm


targets = tf.random_uniform((6, 1))
with tf.name_scope('a'):
    loss_a, grads_a, norm_a = get_tensors(my_loss_fn)
with tf.name_scope('b'):
    loss_b, grads_b, norm_b = get_tensors(my_other_loss_fn)

with tf.name_scope('combined'):
    loss = tf.add(loss_a, loss_b, name='loss')
    grad = tf.gradients(loss, W, name='gradients')

with tf.Session() as sess:
    tf.global_variables_initializer().run(session=sess)
    writer = tf.summary.FileWriter('./tensorboard_results', sess.graph)
    res = sess.run([norm_a, norm_b, grad])
    print(*res, sep='\n')
Edit: In response to your comment... You can check the DAG of a TensorFlow model using TensorBoard. I've updated the code to store the graph.
Run tensorboard --logdir $PWD/tensorboard_results in a terminal and navigate to the URL printed on the command line (typically http://localhost:6006/). Then click on the GRAPH tab to view the DAG. You can recursively expand the tensors, ops, and namespaces into subgraphs to see individual operations and their inputs.
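To tie this back to the original goal of plotting the norms, one possible extension (a sketch, not part of the original answer) is to attach scalar summaries to norm_a and norm_b and write them with the same FileWriter:
# Continuing from the graph built above:
norm_a_summary = tf.summary.scalar('gradient_norm_a', norm_a)
norm_b_summary = tf.summary.scalar('gradient_norm_b', norm_b)
merged = tf.summary.merge([norm_a_summary, norm_b_summary])

with tf.Session() as sess:
    tf.global_variables_initializer().run(session=sess)
    writer = tf.summary.FileWriter('./tensorboard_results', sess.graph)
    summary_str = sess.run(merged)
    writer.add_summary(summary_str, global_step=0)  # in a real loop, pass the training step here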

How to use Batch Normalization correctly in tensorflow?

I had tried several versions of batch_normalization in tensorflow, but none of them worked! The results were all incorrect when I set batch_size = 1 at inference time.
Version 1: directly use the official version in tensorflow.contrib
from tensorflow.contrib.layers.python.layers.layers import batch_norm
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
Version 2: from How could I use Batch Normalization in TensorFlow?
def batch_norm_layer(x, train_phase, scope_bn='bn'):
    bn_train = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
                          updates_collections=None,
                          is_training=True,
                          reuse=None,  # is this right?
                          trainable=True,
                          scope=scope_bn)
    bn_inference = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
                              updates_collections=None,
                              is_training=False,
                              reuse=True,  # is this right?
                              trainable=True,
                              scope=scope_bn)
    z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
    return z
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training is a placeholder that is True at training time and False at inference time.
Version 3: from slim https://github.com/tensorflow/models/blob/master/inception/inception/slim/ops.py
def batch_norm_layer(inputs,
                     is_training=True,
                     scope='bn'):
    decay = 0.999
    epsilon = 0.001
    inputs_shape = inputs.get_shape()
    with tf.variable_scope(scope) as t_scope:
        axis = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        beta = tf.Variable(tf.zeros_initializer(params_shape),
                           name='beta',
                           trainable=True)
        gamma = tf.Variable(tf.ones_initializer(params_shape),
                            name='gamma',
                            trainable=True)
        moving_mean = tf.Variable(tf.zeros_initializer(params_shape),
                                  name='moving_mean',
                                  trainable=False)
        moving_variance = tf.Variable(tf.ones_initializer(params_shape),
                                      name='moving_variance',
                                      trainable=False)
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = tf.nn.moments(inputs, axis)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
        else:
            # Just use the moving_mean and moving_variance.
            mean = moving_mean
            variance = moving_variance
        # Normalize the activations.
        outputs = tf.nn.batch_normalization(
            inputs, mean, variance, beta, gamma, epsilon)
        outputs.set_shape(inputs.get_shape())
        return outputs
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
Version 4: like Version 3, but adds tf.control_dependencies
def batch_norm_layer(inputs,
                     decay=0.999,
                     center=True,
                     scale=True,
                     epsilon=0.001,
                     moving_vars='moving_vars',
                     activation=None,
                     is_training=True,
                     trainable=True,
                     restore=True,
                     scope='bn',
                     reuse=None):
    inputs_shape = inputs.get_shape()
    with tf.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse):
        axis = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]
        # Allocate parameters for the beta and gamma of the normalization.
        beta = tf.Variable(tf.zeros(params_shape), name='beta')
        gamma = tf.Variable(tf.ones(params_shape), name='gamma')
        # Create moving_mean and moving_variance and add them to the
        # GraphKeys.MOVING_AVERAGE_VARIABLES collection.
        moving_mean = tf.Variable(tf.zeros(params_shape), name='moving_mean',
                                  trainable=False)
        moving_variance = tf.Variable(tf.ones(params_shape), name='moving_variance',
                                      trainable=False)
        control_inputs = []
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = tf.nn.moments(inputs, axis)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
            control_inputs = [update_moving_mean, update_moving_variance]
        else:
            # Just use the moving_mean and moving_variance.
            mean = moving_mean
            variance = moving_variance
        # Normalize the activations.
        with tf.control_dependencies(control_inputs):
            return tf.nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
None of the 4 versions of batch normalization is correct. So, how do I use batch normalization correctly?
Another strange phenomenon is that if I make batch_norm_layer a no-op like this, the inference results are all the same.
def batch_norm_layer(inputs, is_training):
    return inputs
I have tested that the following simplified implementation of batch normalization gives the same result as tf.contrib.layers.batch_norm as long as the setting is the same.
def initialize_batch_norm(scope, depth):
    with tf.variable_scope(scope) as bnscope:
        gamma = tf.get_variable("gamma", depth, initializer=tf.constant_initializer(1.0))
        beta = tf.get_variable("beta", depth, initializer=tf.constant_initializer(0.0))
        moving_avg = tf.get_variable("moving_avg", depth, initializer=tf.constant_initializer(0.0), trainable=False)
        moving_var = tf.get_variable("moving_var", depth, initializer=tf.constant_initializer(1.0), trainable=False)
        bnscope.reuse_variables()
def BatchNorm_layer(x, scope, train, epsilon=0.001, decay=.99):
    # Perform a batch normalization after a conv layer or a fc layer
    # gamma: a scale factor
    # beta: an offset
    # epsilon: the variance epsilon - a small float number to avoid dividing by 0
    with tf.variable_scope(scope, reuse=True):
        with tf.variable_scope('BatchNorm', reuse=True) as bnscope:
            gamma, beta = tf.get_variable("gamma"), tf.get_variable("beta")
            moving_avg, moving_var = tf.get_variable("moving_avg"), tf.get_variable("moving_var")
            shape = x.get_shape().as_list()
            control_inputs = []
            if train:
                avg, var = tf.nn.moments(x, range(len(shape) - 1))
                update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
                update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
                control_inputs = [update_moving_avg, update_moving_var]
            else:
                avg = moving_avg
                var = moving_var
            with tf.control_dependencies(control_inputs):
                output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
    return output
The main tips for using the official implementation of batch normalization in tf.contrib.layers.batch_norm are: (1) set is_training=True at training time and is_training=False at validation and testing time; (2) set updates_collections=None to make sure that moving_variance and moving_mean are updated in place; (3) be aware of and careful with the scope setting; (4) set decay to a smaller value (decay=0.9 or decay=0.99) than the default (0.999) if your dataset is small or your total number of training updates/steps is not that large.
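A minimal usage sketch that follows tips (1)-(4) with the official implementation (the names phase_train and net are placeholders for this sketch, not taken from the post above):
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm

phase_train = tf.placeholder(tf.bool, name='phase_train')          # tip (1): feed True / False per phase
net = tf.placeholder(tf.float32, [None, 28, 28, 16], name='net')   # e.g. conv activations

net_bn = batch_norm(net,
                    decay=0.9,                  # tip (4): smaller decay for small datasets / short trainings
                    center=True, scale=True,
                    updates_collections=None,   # tip (2): update moving mean/variance in place
                    is_training=phase_train,    # tip (1): switch between batch and moving statistics
                    scope='bn')                 # tip (3): mind the variable scope
output = tf.nn.relu(net_bn)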
I found Zhongyu Kuang's code really useful, but I got stuck on how to dynamically switch between train and test ops, i.e. how to move from a Python boolean is_training to a TensorFlow boolean placeholder is_training. I need this functionality to be able to test the network on the validation set during training.
Starting from his code and inspired by this, I wrote the following code:
def batch_norm(x, scope, is_training, epsilon=0.001, decay=0.99):
    """
    Returns a batch normalization layer that automatically switches between train and test phases based on the
    tensor is_training

    Args:
        x: input tensor
        scope: scope name
        is_training: boolean tensor or variable
        epsilon: epsilon parameter - see batch_norm_layer
        decay: decay parameter - see batch_norm_layer

    Returns:
        The correct batch normalization layer based on the value of is_training
    """
    assert isinstance(is_training, (ops.Tensor, variables.Variable)) and is_training.dtype == tf.bool

    return tf.cond(
        is_training,
        lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=True, reuse=None),
        lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=False, reuse=True),
    )


def batch_norm_layer(x, scope, is_training, epsilon=0.001, decay=0.99, reuse=None):
    """
    Performs a batch normalization layer

    Args:
        x: input tensor
        scope: scope name
        is_training: python boolean value
        epsilon: the variance epsilon - a small float number to avoid dividing by 0
        decay: the moving average decay

    Returns:
        The ops of a batch normalization layer
    """
    with tf.variable_scope(scope, reuse=reuse):
        shape = x.get_shape().as_list()
        # gamma: a trainable scale factor
        gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0), trainable=True)
        # beta: a trainable shift value
        beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0), trainable=True)
        moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
        moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
        if is_training:
            # tf.nn.moments == calculate the mean and the variance of the tensor x
            avg, var = tf.nn.moments(x, range(len(shape) - 1))
            update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
            update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
            control_inputs = [update_moving_avg, update_moving_var]
        else:
            avg = moving_avg
            var = moving_var
            control_inputs = []
        with tf.control_dependencies(control_inputs):
            output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)

    return output
Then I use the batch_norm layer in this way:
fc1_weights = tf.Variable(...)
fc1 = tf.matmul(x, fc1_weights)
fc1 = batch_norm(fc1, 'fc1_bn', is_training=is_training)
fc1 = tf.nn.relu(fc1)
where is_training is a boolean placeholder. Note that the bias addition is not needed because it is replaced by the beta parameter, as explained in the Batch Normalization paper.
During execution:
# Training phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: True})
# Testing phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: False})
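As a closing note, not covered in the answers above: later TF 1.x releases also ship tf.layers.batch_normalization, which manages the gamma/beta/moving statistics itself; the only requirement is to run the update ops it registers in tf.GraphKeys.UPDATE_OPS, the same pattern used in the very first training() function on this page. A sketch, reusing the fc1, is_training and loss names from the previous example:
fc1_bn = tf.layers.batch_normalization(fc1, training=is_training)
fc1_act = tf.nn.relu(fc1_bn)

# The moving mean/variance updates are registered in UPDATE_OPS and must be
# run together with the training step:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_step = tf.train.AdamOptimizer(1e-3).minimize(loss)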