Using tf.train.exponential_decay with predefined estimator? - tensorflow

I am trying to use tf.train.exponential_decay with predefined estimators and this is proving to be super difficult for some reason. Am I missing something here?
Here is my old code with constant learning_rate:
classifier = tf.estimator.DNNRegressor(
feature_columns=f_columns,
model_dir='./TF',
hidden_units=[2, 2],
optimizer=tf.train.ProximalAdagradOptimizer(
learning_rate=0.50,
l1_regularization_strength=0.001,
))
Now I tried adding this:
starter_learning_rate = 0.50
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
10000, 0.96, staircase=True)
but now what?
estimator.predict() does not accept global_step so it will be stuck at 0?
Even if I pass learning_rate to tf.train.ProximalAdagradOptimizer() I get an error saying
"ValueError: Tensor("ExponentialDecay:0", shape=(), dtype=float32)
must be from the same graph as
Tensor("dnn/hiddenlayer_0/kernel/part_0:0", shape=(62, 2),
dtype=float32_ref)."
Your help is greatly appreciated. I am using TF1.6 btw.

you should let optimizer under mode == tf.estimator.ModeKeys.TRAIN
here is sample code
def _model_fn(features, labels, mode, config):
# xxxxxxxxx
# xxxxxxxxx
assert mode == tf.estimator.ModeKeys.TRAIN
global_step = tf.train.get_global_step()
decay_learning_rate = tf.train.exponential_decay(learning_rate, global_step, 100, 0.98, staircase=True)
optimizer = adagrad.AdagradOptimizer(decay_learning_rate)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_chief_hooks=chief_hooks, eval_metric_ops=metrics)

Related

Facing error in tensorflow2.0 while building tf_estimator

import custom_model as CM
import input_pipeline as IP
import tensorflow as tf
def custom_estimator(features, labels, mode):
logits = CM.model_net(features=features, n_classes=5)
prediction = tf.keras.layers.Activation('softmax')(logits)
preds_dict = {'class': tf.argmax(input=prediction, axis=1),
'probabilities': prediction,
'logits': logits}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode,
predictions=preds_dict)
# Compute loss
labels = tf.reshape(labels, (BATCH_SIZE, 5))
loss = tf.keras.losses.categorical_crossentropy(y_true=labels,
y_pred=prediction)
# Compute evaluation metrics
accuracy = custom_accuracy(labels, prediction)
metrics = {'accuracy': accuracy}
tf.summary.scalar('accuracy', accuracy)
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(mode, loss=loss,
eval_metric_ops=metrics)
optimizer = tf.keras.optimizers.Adam()
train_op = optimizer.minimize(loss)
return tf.estimator.EstimatorSpec(mode, loss=loss,
train_op=train_op)
# Build tf_estimator
classifier = tf.estimator.Estimator(model_fn=custom_estimator,
model_dir=model_dir)
# Train the estimator
TRAIN_FILES, TRAIN_LABELS = IP.map_file_to_label(data_dir=data_dir)
TRAIN = classifier.train(input_fn=lambda:
IP.imgs_input_fn(TRAIN_FILES, labels=TRAIN_LABELS,
perform_shuffle=True, repeat_count=EPOCHS,
batch_size=BATCH_SIZE),
steps=int(len(TRAIN_LABELS)/BATCH_SIZE))
This is the error I am facing with TensorFlow-2.0. Error image & code for the same attached here. Please help.
If I enter var_list=None then the error is "ValueError: Passed in object of type , not tf.Tensor"
First of all, I don't think this Estimator code example is TensorFlow 2.0 compliant. In any case, if peradventure you're using the 1.x versions, replace:
train_op = optimizer.minimize(loss)
with this:
train_op = optimizer.minimize(
loss=average_loss, global_step=tf.train.get_global_step())
If indeed, you're using TensorFlow 2.0, then replace with:
train_op = optimizer.minimize(
loss=average_loss, global_step=tf.compat.v1.train.get_global_step())

How to get predictions from a tf.estimator trained CNN

I have trained a CNN using the tf.estimator API, but am having trouble getting predictions out in a way that is useful to me.
I need to feed images to my CNN in real time as they are received from a camera. In an older net design I made the Controller_tf class, which worked fine for doing that. So I have tried to adapt it to a new CNN trained using tf.estimator (as said earlier).
The estimator.predict interface seems to want to be invoked via a tf.app.run() call (would be glad to be proved wrong about that), which is why I am trying to run the CNN using tf.Session() (with if statements inside the model function to only run the relevant parts) but I'm currently getting the error:
ValueError: Fetch argument 'infer' cannot be interpreted as a Tensor. ("The name 'infer' refers to an Operation not in the graph.")
I can't quite see where I am going wrong. Is the trained model incompatable with the run in PREDICT mode? Any help will be very much appreciated. Any way here is the code:
class Controller_tf:
set_speed = None
def __init__(self, model, ckpt_path, set_speed_in):
self.set_speed = set_speed_in
self.x = tf.placeholder(tf.float32, shape = (None, 104, 160, 3))
self.y = model(self.x, None, tf.estimator.ModeKeys.PREDICT)
# make TF use memory growth method
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
self.sess = tf.Session(config=config)
saver = tf.train.Saver()
saver.restore(self.sess, ckpt_path)
def update(self, message):
# The current speed of the car
image = frame2numpy(message['frame'], (160,104))
image_array = np.asarray(image)
turn_logits = self.sess.run(self.y, {self.x: image_array[None, :, :, :]})
return turn_logits
model = cnn_model_fn3
ckpt = 'ckpts/stc_model3/model.ckpt-27621'
controller = Controller_tf(model, ckpt, 18)
image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0008465.jpg'
#image_file = 'G:/Datasets/ds072.001/ds072.001-fm-0009156.jpg'
satnavimg = load_image(image_file)
satnavimg = np.asarray([satnavimg])
satnavimg = (satnavimg/127.5) - 1.0
print(np.shape(satnavimg))
msg = {'frame': satnavimg}
turn = controller.update(msg)
print(turn)
The model function is:
def cnn_model_fn3(features, labels, mode):
if mode == tf.estimator.ModeKeys.PREDICT:
input_layer = features
else:
input_layer = tf.reshape(features["image_data"], [-1, 104, 160, 3])
conv1 = tf.layers.conv2d(
inputs=input_layer,
filters=32,
kernel_size=[10, 10],
padding="same",
activation=tf.nn.relu,
name='Conv1')
... removed layer code for brevity ...
logits = tf.layers.dense(
inputs=dropout1,
units=3,
name='Dense3')
predictions = {
"classes": tf.argmax(input=logits, axis=1),
"probabilities": tf.nn.softmax(logits, name="softmax_tensor")
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
# Calculate Loss (for both TRAIN and EVAL modes)
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
# Configure the Training Op (for TRAIN mode)
if mode == tf.estimator.ModeKeys.TRAIN:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(
loss=loss,
global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
# Add evaluation metrics (for EVAL mode)
if mode == tf.estimator.ModeKeys.EVAL:
eval_metric_ops = {
"accuracy": tf.metrics.accuracy(
labels=labels, predictions=predictions["classes"])}
if mode == tf.estimator.ModeKeys.PREDICT:
return logits
return tf.estimator.EstimatorSpec(
mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

Tensorflow: Using Batch Normalization gives poor (erratic) validation loss and accuracy

I am trying to use Batch Normalization using tf.layers.batch_normalization() and my code looks like this:
def create_conv_exp_model(fingerprint_input, model_settings, is_training):
# Dropout placeholder
if is_training:
dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')
# Mode placeholder
mode_placeholder = tf.placeholder(tf.bool, name="mode_placeholder")
he_init = tf.contrib.layers.variance_scaling_initializer(mode="FAN_AVG")
# Input Layer
input_frequency_size = model_settings['bins']
input_time_size = model_settings['spectrogram_length']
net = tf.reshape(fingerprint_input,
[-1, input_time_size, input_frequency_size, 1],
name="reshape")
net = tf.layers.batch_normalization(net,
training=mode_placeholder,
name='bn_0')
for i in range(1, 6):
net = tf.layers.conv2d(inputs=net,
filters=8*(2**i),
kernel_size=[5, 5],
padding='same',
kernel_initializer=he_init,
name="conv_%d"%i)
net = tf.layers.batch_normalization(net,
training=mode_placeholder,
name='bn_%d'%i)
with tf.name_scope("relu_%d"%i):
net = tf.nn.relu(net)
net = tf.layers.max_pooling2d(net, [2, 2], [2, 2], 'SAME',
name="maxpool_%d"%i)
net_shape = net.get_shape().as_list()
net_height = net_shape[1]
net_width = net_shape[2]
net = tf.layers.conv2d( inputs=net,
filters=1024,
kernel_size=[net_height, net_width],
strides=(net_height, net_width),
padding='same',
kernel_initializer=he_init,
name="conv_f")
net = tf.layers.batch_normalization( net,
training=mode_placeholder,
name='bn_f')
with tf.name_scope("relu_f"):
net = tf.nn.relu(net)
net = tf.layers.conv2d( inputs=net,
filters=model_settings['label_count'],
kernel_size=[1, 1],
padding='same',
kernel_initializer=he_init,
name="conv_l")
### Squeeze
squeezed = tf.squeeze(net, axis=[1, 2], name="squeezed")
if is_training:
return squeezed, dropout_prob, mode_placeholder
else:
return squeezed, mode_placeholder
And my train step looks like this:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_input)
gvs = optimizer.compute_gradients(cross_entropy_mean)
capped_gvs = [(tf.clip_by_value(grad, -2., 2.), var) for grad, var in gvs]
train_step = optimizer.apply_gradients(gvs))
During training, I am feeding the graph with:
train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
[
merged_summaries, evaluation_step, cross_entropy_mean, train_step,
increment_global_step
],
feed_dict={
fingerprint_input: train_fingerprints,
ground_truth_input: train_ground_truth,
learning_rate_input: learning_rate_value,
dropout_prob: 0.5,
mode_placeholder: True
})
During validation,
validation_summary, validation_accuracy, conf_matrix = sess.run(
[merged_summaries, evaluation_step, confusion_matrix],
feed_dict={
fingerprint_input: validation_fingerprints,
ground_truth_input: validation_ground_truth,
dropout_prob: 1.0,
mode_placeholder: False
})
My loss and accuracy curves (orange is training, blue is validation):
Plot of loss vs number of iterations,
Plot of accuracy vs number of iterations
The validation loss (and accuracy) seem very erratic. Is my implementation of Batch Normalization wrong? Or is this normal with Batch Normalization and I should wait for more iterations?
You need to pass is_training to tf.layers.batch_normalization(..., training=is_training) or it tries to normalize the inference minibatches using the minibatch statistics instead of the training statistics, which is wrong.
There are mainly two things to check.
1. Are you sure that you are using batch normalization (BN) correctly in the train op?
If you read the layer documentation:
Note: when training, the moving_mean and moving_variance need to be updated.
By default the update ops are placed in tf.GraphKeys.UPDATE_OPS, so they
need to be added as a dependency to the train_op. Also, be sure to add
any batch_normalization ops before getting the update_ops collection.
Otherwise, update_ops will be empty, and training/inference will not work
properly.
For example:
x_norm = tf.layers.batch_normalization(x, training=training)
# ...
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss)
2. Otherwise, try lowering the "momentum" in the BN.
During the training, in fact, the BN uses two moving averages of the mean and the variance that are supposed to approximate the population statistics. Mean and variance are initialized to 0 and 1 respectively and then, step by step, they are multiplied by the momentum value (default is 0.99) and added the new value*0.01. At inference (test) time, the normalization uses these statistics. For this reason, it takes these values a little while to arrive at the "real" mean and variance of the data.
Source:
https://www.tensorflow.org/api_docs/python/tf/layers/batch_normalization
https://github.com/keras-team/keras/issues/7265
https://github.com/keras-team/keras/issues/3366
The original BN paper can be found here:
https://arxiv.org/abs/1502.03167
I also observed oscillations in validation loss when adding batch norm before ReLU. We found that moving the batch norm after the ReLU resolved the issue.

zip object for opt.apply_gradients

There is a program including an optimiziton function, which has following code segment to compute gradient
if hypes['clip_norm'] > 0:
grads, tvars = zip(*grads_and_vars)
clip_norm = hypes["clip_norm"]
clipped_grads, norm = tf.clip_by_global_norm(grads, clip_norm)
grads_and_vars = zip(clipped_grads, tvars)
print('grads_and_vars ',grads_and_vars)
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = (grads_and_vars,
global_step=global_step)
However, running the program raises the following error
File "/home/FCN/kittiseg/hypes/../optimizer/generic_optimizer.py", line 92, in training
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
File "tensorflow/tf_0.12/lib/python3.4/site-packages/tensorflow/python/training/optimizer.py", line 370, in apply_gradients
raise ValueError("No variables provided.")
ValueError: No variables provided.
I digged into the code, and think it is caused by the variable grads_and_var. I printed it out, which is just grads_and_vars <zip object at 0x2b0d6c27e348>. But I don't know how to analyze it and what causes the
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
fail?
This is the original training function
def training(hypes, loss, global_step, learning_rate, opt=None):
"""Sets up the training Ops.
Creates a summarizer to track the loss over time in TensorBoard.
Creates an optimizer and applies the gradients to all trainable variables.
The Op returned by this function is what must be passed to the
`sess.run()` call to cause the model to train.
Args:
loss: Loss tensor, from loss().
global_step: Integer Variable counting the number of training steps
processed.
learning_rate: The learning rate to use for gradient descent.
Returns:
train_op: The Op for training.
"""
# Add a scalar summary for the snapshot loss.''
sol = hypes["solver"]
hypes['tensors'] = {}
hypes['tensors']['global_step'] = global_step
total_loss = loss['total_loss']
with tf.name_scope('training'):
if opt is None:
if sol['opt'] == 'RMS':
opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
decay=0.9,
epsilon=sol['epsilon'])
elif sol['opt'] == 'Adam':
opt = tf.train.AdamOptimizer(learning_rate=learning_rate,
epsilon=sol['adam_eps'])
elif sol['opt'] == 'SGD':
lr = learning_rate
opt = tf.train.GradientDescentOptimizer(learning_rate=lr)
else:
raise ValueError('Unrecognized opt type')
hypes['opt'] = opt
grads_and_vars = opt.compute_gradients(total_loss)
if hypes['clip_norm'] > 0:
grads, tvars = zip(*grads_and_vars)
clip_norm = hypes["clip_norm"]
clipped_grads, norm = tf.clip_by_global_norm(grads, clip_norm)
grads_and_vars = zip(clipped_grads, tvars)
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = opt.apply_gradients(grads_and_vars,
global_step=global_step)
return train_op
Seems like a bug in the gradient clipping section. I had the same problem, did some research on how to do it properly (see source below) and it seems to work now.
replace the section
grads, tvars = zip(*grads_and_vars)
clip_norm = hypes["clip_norm"]
clipped_grads, norm = tf.clip_by_global_norm(grads, clip_norm)
grads_and_vars = zip(clipped_grads, tvars)
with
clip_norm = hypes["clip_norm"]
grads_and_vars = [(tf.clip_by_value(grad, -clip_norm, clip_norm), var)
for grad, var in grads_and_vars]
and it should work.
source: How to effectively apply gradient clipping in tensor flow?
I believe that tf.clip_by_value have the different effect to the gradient values from tf.clip_by_global_norm.
Apparently tf.clip_by_value clips each gradient values independently into the clip range, while tf.clip_by_global_norm calculates total norm of all gradient values and rescale each value in the way that every gradient values will fit into the clip range, while preserve proportion between every gradient values.
To illustrate the different between the two functions, let's say we have
original gradients = [2.0, 1.0, 2.0]
tf.clip_by_value(gradients, -1.0, 1.0) will cause gradients to be [1.0, 1.0, 1.0]
tf.clip_by_global_norm(gradient, 1.0) will cause gradients to be [1.0, 0.5, 1.0]
To answer the original question, what works for me is that I have to convert zip object to list as below:
grads, tvars = zip(*grads_and_vars)
(clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
grads_and_vars = list(zip(clipped_grads, tvars))

How to use Batch Normalization correctly in tensorflow?

I had tried several versions of batch_normalization in tensorflow, but none of them worked! The results were all incorrect when I set batch_size = 1 at inference time.
Version 1: directly use the official version in tensorflow.contrib
from tensorflow.contrib.layers.python.layers.layers import batch_norm
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
Version 2: from How could I use Batch Normalization in TensorFlow?
def batch_norm_layer(x, train_phase, scope_bn='bn'):
bn_train = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
updates_collections=None,
is_training=True,
reuse=None, # is this right?
trainable=True,
scope=scope_bn)
bn_inference = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
updates_collections=None,
is_training=False,
reuse=True, # is this right?
trainable=True,
scope=scope_bn)
z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
return z
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training is a placeholder at training time is True and False at inference time.
version 3: from slim https://github.com/tensorflow/models/blob/master/inception/inception/slim/ops.py
def batch_norm_layer(inputs,
is_training=True,
scope='bn'):
decay=0.999
epsilon=0.001
inputs_shape = inputs.get_shape()
with tf.variable_scope(scope) as t_scope:
axis = list(range(len(inputs_shape) - 1))
params_shape = inputs_shape[-1:]
# Allocate parameters for the beta and gamma of the normalization.
beta, gamma = None, None
beta = tf.Variable(tf.zeros_initializer(params_shape),
name='beta',
trainable=True)
gamma = tf.Variable(tf.ones_initializer(params_shape),
name='gamma',
trainable=True)
moving_mean = tf.Variable(tf.zeros_initializer(params_shape),
name='moving_mean',
trainable=False)
moving_variance = tf.Variable(tf.ones_initializer(params_shape),
name='moving_variance',
trainable=False)
if is_training:
# Calculate the moments based on the individual batch.
mean, variance = tf.nn.moments(inputs, axis)
update_moving_mean = moving_averages.assign_moving_average(
moving_mean, mean, decay)
update_moving_variance = moving_averages.assign_moving_average(
moving_variance, variance, decay)
else:
# Just use the moving_mean and moving_variance.
mean = moving_mean
variance = moving_variance
# Normalize the activations.
outputs = tf.nn.batch_normalization(
inputs, mean, variance, beta, gamma, epsilon)
outputs.set_shape(inputs.get_shape())
return outputs
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
version 4: like version3, but add tf.control_dependencies
def batch_norm_layer(inputs,
decay=0.999,
center=True,
scale=True,
epsilon=0.001,
moving_vars='moving_vars',
activation=None,
is_training=True,
trainable=True,
restore=True,
scope='bn',
reuse=None):
inputs_shape = inputs.get_shape()
with tf.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse):
axis = list(range(len(inputs_shape) - 1))
params_shape = inputs_shape[-1:]
# Allocate parameters for the beta and gamma of the normalization.
beta = tf.Variable(tf.zeros(params_shape), name='beta')
gamma = tf.Variable(tf.ones(params_shape), name='gamma')
# Create moving_mean and moving_variance add them to
# GraphKeys.MOVING_AVERAGE_VARIABLES collections.
moving_mean = tf.Variable(tf.zeros(params_shape), name='moving_mean',
trainable=False)
moving_variance = tf.Variable(tf.ones(params_shape), name='moving_variance',
trainable=False)
control_inputs = []
if is_training:
# Calculate the moments based on the individual batch.
mean, variance = tf.nn.moments(inputs, axis)
update_moving_mean = moving_averages.assign_moving_average(
moving_mean, mean, decay)
update_moving_variance = moving_averages.assign_moving_average(
moving_variance, variance, decay)
control_inputs = [update_moving_mean, update_moving_variance]
else:
# Just use the moving_mean and moving_variance.
mean = moving_mean
variance = moving_variance
# Normalize the activations.
with tf.control_dependencies(control_inputs):
return tf.nn.batch_normalization(
inputs, mean, variance, beta, gamma, epsilon)
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
The 4 versions of Batch_normalization are all not correct. So, how to use batch normalization correctly?
Another strange phenomenon is if I set batch_norm_layer to null like this, the inference result are all same.
def batch_norm_layer(inputs, is_training):
return inputs
I have tested that the following simplified implementation of batch normalization gives the same result as tf.contrib.layers.batch_norm as long as the setting is the same.
def initialize_batch_norm(scope, depth):
with tf.variable_scope(scope) as bnscope:
gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0))
beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0))
moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
bnscope.reuse_variables()
def BatchNorm_layer(x, scope, train, epsilon=0.001, decay=.99):
# Perform a batch normalization after a conv layer or a fc layer
# gamma: a scale factor
# beta: an offset
# epsilon: the variance epsilon - a small float number to avoid dividing by 0
with tf.variable_scope(scope, reuse=True):
with tf.variable_scope('BatchNorm', reuse=True) as bnscope:
gamma, beta = tf.get_variable("gamma"), tf.get_variable("beta")
moving_avg, moving_var = tf.get_variable("moving_avg"), tf.get_variable("moving_var")
shape = x.get_shape().as_list()
control_inputs = []
if train:
avg, var = tf.nn.moments(x, range(len(shape)-1))
update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
control_inputs = [update_moving_avg, update_moving_var]
else:
avg = moving_avg
var = moving_var
with tf.control_dependencies(control_inputs):
output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
return output
The main tips with using the official implementation of batch normalization in tf.contrib.layers.batch_norm are: (1) set is_training=True for training time and is_training=False for validation and testing time; (2) set updates_collections=None to make sure that moving_variance and moving_mean are updated in place; (3) be aware and careful with the scope setting; (4) set decay to be a smaller value (decay=0.9 or decay=0.99) than default value (default is 0.999) if your dataset is small or your total training updates/steps are not that large.
I found the Zhongyu Kuang's code really useful, but I stuck on how to dynamically switch between train and test ops, i.e. how to move from a python boolean is_training to a tensorflow boolean placeholder is_training. I need this functionality to be able to test the network on the validation set during the training.
Starting from his code and inspired by this, I wrote the following code:
def batch_norm(x, scope, is_training, epsilon=0.001, decay=0.99):
"""
Returns a batch normalization layer that automatically switch between train and test phases based on the
tensor is_training
Args:
x: input tensor
scope: scope name
is_training: boolean tensor or variable
epsilon: epsilon parameter - see batch_norm_layer
decay: epsilon parameter - see batch_norm_layer
Returns:
The correct batch normalization layer based on the value of is_training
"""
assert isinstance(is_training, (ops.Tensor, variables.Variable)) and is_training.dtype == tf.bool
return tf.cond(
is_training,
lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=True, reuse=None),
lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=False, reuse=True),
)
def batch_norm_layer(x, scope, is_training, epsilon=0.001, decay=0.99, reuse=None):
"""
Performs a batch normalization layer
Args:
x: input tensor
scope: scope name
is_training: python boolean value
epsilon: the variance epsilon - a small float number to avoid dividing by 0
decay: the moving average decay
Returns:
The ops of a batch normalization layer
"""
with tf.variable_scope(scope, reuse=reuse):
shape = x.get_shape().as_list()
# gamma: a trainable scale factor
gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0), trainable=True)
# beta: a trainable shift value
beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0), trainable=True)
moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
if is_training:
# tf.nn.moments == Calculate the mean and the variance of the tensor x
avg, var = tf.nn.moments(x, range(len(shape)-1))
update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
control_inputs = [update_moving_avg, update_moving_var]
else:
avg = moving_avg
var = moving_var
control_inputs = []
with tf.control_dependencies(control_inputs):
output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
return output
Then I use the batch_norm layer in this way:
fc1_weights = tf.Variable(...)
fc1 = tf.matmul(x, fc1_weights)
fc1 = batch_norm(fc1, 'fc1_bn', is_training=is_training)
fc1 = tf.nn.relu(fc1)
Where is_training is a boolean placeholder. Note that the bias addition is not needed because is replaced by the beta parameter as explained in the Batch Normalization paper.
During execution:
# Training phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: True})
# Testing phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: False})