Tensorflow Eager Execution does not work with Learning Rate decay - tensorflow

trying here to make an eager exec model work with LR decay, but no success. It seems to be a bug, since it appear that the learning rate decay tensor does not get updated. If I am missing something can you land a hand here. Thanks.
The code bellow is learning some word embeddings. However, the learning rate decay section does not work at all.
class Word2Vec(tf.keras.Model):
def __init__(self, vocab_size, embed_size, num_sampled=NUM_SAMPLED):
self.vocab_size = vocab_size
self.num_sampled = num_sampled
self.embed_matrix = tfe.Variable(tf.random_uniform(
[vocab_size, embed_size]), name="embedding_matrix")
self.nce_weight = tfe.Variable(tf.truncated_normal(
[vocab_size, embed_size],
stddev=1.0 / (embed_size ** 0.5)), name="weights")
self.nce_bias = tfe.Variable(tf.zeros([vocab_size]), name="biases")
def compute_loss(self, center_words, target_words):
"""Computes the forward pass of word2vec with the NCE loss."""
embed = tf.nn.embedding_lookup(self.embed_matrix, center_words)
loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weight,
biases=self.nce_bias,
labels=target_words,
inputs=embed,
num_sampled=self.num_sampled,
num_classes=self.vocab_size))
return loss
def gen():
yield from word2vec_utils.batch_gen(DOWNLOAD_URL, EXPECTED_BYTES,
VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW,
VISUAL_FLD)
def main():
dataset = tf.data.Dataset.from_generator(gen, (tf.int32, tf.int32),
(tf.TensorShape([BATCH_SIZE]),
tf.TensorShape([BATCH_SIZE, 1])))
global_step = tf.train.get_or_create_global_step()
starter_learning_rate = 1.0
end_learning_rate = 0.01
decay_steps = 1000
learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
decay_steps, end_learning_rate,
power=0.5)
train_writer = tf.contrib.summary.create_file_writer('./checkpoints')
train_writer.set_as_default()
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
model = Word2Vec(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE)
grad_fn = tfe.implicit_value_and_gradients(model.compute_loss)
total_loss = 0.0 # for average loss in the last SKIP_STEP steps
checkpoint_dir = "./checkpoints/"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
root = tfe.Checkpoint(optimizer=optimizer,
model=model,
optimizer_step=tf.train.get_or_create_global_step())
while global_step < NUM_TRAIN_STEPS:
for center_words, target_words in tfe.Iterator(dataset):
with tf.contrib.summary.record_summaries_every_n_global_steps(100):
if global_step >= NUM_TRAIN_STEPS:
break
loss_batch, grads = grad_fn(center_words, target_words)
tf.contrib.summary.scalar('loss', loss_batch)
tf.contrib.summary.scalar('learning_rate', learning_rate)
# print(grads)
# print(len(grads))
total_loss += loss_batch
optimizer.apply_gradients(grads, global_step)
if (global_step.numpy() + 1) % SKIP_STEP == 0:
print('Average loss at step {}: {:5.1f}'.format(
global_step.numpy(), total_loss / SKIP_STEP))
total_loss = 0.0
root.save(file_prefix=checkpoint_prefix)
if __name__ == '__main__':
main()

Note that when eager execution is enabled, the tf.Tensor objects represent concrete values (as opposed to symbolic handles of computation that will occur on Session.run() calls).
As a result, in your code snippet above, the line:
learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step.numpy(),
decay_steps, end_learning_rate,
power=0.5)
is computing the decayed value once, using the global_step at the time it was invoked, and when the optimizer is being created with:
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
it is being given a fixed learning rate.
To decay the learning rate, you'd want to invoke tf.train.polynomial_decay repeatedly (with updated values for global_step). One way to do this would be to replicate what is done in the RNN example, using something like this:
starter_learning_rate = 1.0
learning_rate = tfe.Variable(starter_learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.95)
while global_step < NUM_TRAIN_STEPS:
# ....
learning_rate.assign(tf.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, end_learning_rate, power=0.5))
This way you've captured the learning_rate in a variable that can be updated. Furthermore, it's simple to include the current learning_rate in the checkpoint as well (by including it when creating the Checkpoint object).
Hope that helps.

Related

Need help in compiling custom loss

I am adding a custom loss to a VAE, as suggested here: https://www.linkedin.com/pulse/supervised-variational-autoencoder-code-included-ibrahim-sobh-phd/
Instead of defining a loss function, it uses a dense network and takes its output as the loss (if I understand correctly).
# New: add a classifier
clf_latent_inputs = Input(shape=(latent_dim,), name='z_sampling_clf')
clf_outputs = Dense(10, activation='softmax', name='class_output')(clf_latent_inputs)
clf_supervised = Model(clf_latent_inputs, clf_outputs, name='clf')
clf_supervised.summary()
# instantiate VAE model
# New: Add another output
outputs = [decoder(encoder(inputs)[2]), clf_supervised(encoder(inputs)[2])]
vae = Model(inputs, outputs, name='vae_mlp')
vae.summary()
reconstruction_loss = binary_crossentropy(inputs, outputs[0])
reconstruction_loss *= original_dim
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean((reconstruction_loss + kl_loss) /100.0)
vae.add_loss(vae_loss)
# New: add the clf loss
vae.compile(optimizer='adam', loss={'clf': 'categorical_crossentropy'}) ===> this line <===
vae.summary()
# reconstruction_loss = binary_crossentropy(inputs, outputs)
svae_history = vae.fit(x_train, {'clf': y_train},
epochs=epochs,
batch_size=batch_size)
I was stuck at the compilation step (annotated as ===> this line <===) that I met a type error:
TypeError: Expected float32, got <function
BaseProtVAE.init..vae_loss at 0x7ff53051dd08> of type
'function' instead.
I need your help if you've got any suggestions.
There are several ways to implement VAE in Tensorflow. I propose an alternative implementation that can be found in custom_layers_and_models in Tensorflow guide pages :
Let's put all of these things together into an end-to-end example: we're going to implement a Variational AutoEncoder (VAE). We'll train it on MNIST digits.
It uses custom Model classes and the gradient tape. In this way, it is quite easy to add the classifier into the VAE model and add the categorical cross-entropy to the total loss during the optimization.
All you need is to modify:
class VariationalAutoEncoder(Model):
"""Combines the encoder and decoder into an end-to-end model for training."""
def __init__(
self,
original_dim,
intermediate_dim=64,
latent_dim=32,
name="autoencoder",
**kwargs
):
super(VariationalAutoEncoder, self).__init__(name=name, **kwargs)
self.original_dim = original_dim
self.encoder = Encoder(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
self.decoder = Decoder(original_dim, intermediate_dim=intermediate_dim)
self.clf_supervised = Dense(10, activation='softmax', name='class_output')
def call(self, inputs):
z_mean, z_log_var, z = self.encoder(inputs)
reconstructed = self.decoder(z)
# Add KL divergence regularization loss.
kl_loss = -0.5 * tf.reduce_mean(
z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
)
self.add_loss(kl_loss)
# classifier
y_pred = self.clf_supervised(z)
return reconstructed, y_pred
by adding the lines self.clf_supervised = Dense(10, activation='softmax', name='class_output') and y_pred = self.clf_supervised(z).
The optimization is done this way:
vae = VariationalAutoEncoder(original_dim, intermediate_dim, latent_dim)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
mse_loss_fn = tf.keras.losses.MeanSquaredError()
loss_metric = tf.keras.metrics.Mean()
epochs = 2
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=500).batch(4)
# Iterate over epochs.
for epoch in range(epochs):
print("Start of epoch %d" % (epoch,))
# Iterate over the batches of the dataset.
for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
with tf.GradientTape() as tape:
reconstructed, y_pred = vae(x_batch_train)
clf_loss = tf.keras.losses.SparseCategoricalCrossentropy()(y_batch_train, y_pred)
# Compute reconstruction loss
loss = mse_loss_fn(x_batch_train, reconstructed)
loss += sum(vae.losses) # Add KLD regularization loss
loss += clf_loss
grads = tape.gradient(loss, vae.trainable_weights)
optimizer.apply_gradients(zip(grads, vae.trainable_weights))
loss_metric(loss)
if step % 100 == 0:
print("step %d: mean loss = %.4f" % (step, loss_metric.result()))
The rest of the code is in the link above. The main change is the optimization done with tf.GradientTape(). It's a bit more complicated than the fit method but it's still quite simple and very powerful.

Weighted Absolute Error implementation doesn't work in tensorflow (keras)

I have created custom loss (Weighted Absolute error) in keras but implementation doesn't work - I get an error ValueError: No gradients provided for any variable: ['my_model/conv2d/kernel:0', 'my_model/conv2d/bias:0'].
I want to apply different weight for each pixel.
class WeightedMeanAbsoluteError(tf.keras.metrics.Metric):
def __init__(self, name='weighted_mean_absolute_error'):
super(WeightedMeanAbsoluteError, self).__init__(name=name)
self.wmae = self.add_weight(name='wmae', initializer='zeros')
def update_state(self, y_true, y_pred, loss_weights):
values = tf.math.abs(y_true - y_pred) * loss_weights
return self.wmae.assign_add(tf.reduce_sum(values))
def result(self):
return self.wmae
def reset_states(self):
# The state of the metric will be reset at the start of each epoch.
self.wmae.assign(0.)
loss_object = WeightedMeanAbsoluteError()
train_loss = WeightedMeanAbsoluteError()
I use the following code to implement a training step:
#tf.function
def train_step(input_images, output_images):
with tf.GradientTape() as tape:
# training=True is only needed if there are layers with different
# behavior during training versus inference (e.g. Dropout).
result_images = model(input_images, training=True)
loss = loss_object(output_images, result_images)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
Also my code works just fine if I use
loss_object = tf.keras.losses.MeanAbsoluteError()
train_loss = tf.keras.metrics.MeanAbsoluteError()
The best and simple way to minimize a weighted standard loss (such mae) is using the sample_weights parameter in fit method where we pass an array with the desired weight of each sample
X = np.random.uniform(0,1, (1000,50))
y = np.random.uniform(0,1, 1000)
W = np.random.randint(1,10, 1000)
inp = Input((50))
x = Dense(64, activation='relu')(inp)
out = Dense(10)(x)
model = Model(inp, out)
model.compile('adam','mae')
model.fit(X,y, epochs=100, sample_weights=W)

How to obtain second derivatives of a Loss function with respect to the parameters of a neural network using gradient tape in Tensorflow eager mode

I am creating a basic auto-encoder for the MNIST dataset using TensorFlow eager mode. I would like to observe the second-order partial derivatives of my loss function with respect to the parameters of the network as it trains. Currently, calling tape.gradient() on the output of in_tape.gradient returns None (where in_tape is a GradientTape nested inside the outer GradientTape called tape, I have included my code below)
I have tried calling the tape.gradient() directly on the in_tape.gradient() with None being returned. My next approach was to iterate over the output of in_tape.gradient() and apply tape.gradient() to each gradient individually (with respect to my model variables) with None being returned each time.
I receive a single None value for any tape.gradient() call, not a list of None values which I believe would indicate None for a single partial derivative, which would be expected in some cases.
I am currently only trying to get the second derivatives for the first set of weights (from input to hidden layers), however, I will scale it to include all weights once I have this working.
tf.enable_eager_execution()
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images = train_images.reshape((train_images.shape[0], train_images.shape[1]*train_images.shape[2])).astype(np.float32)/255
test_images = test_images.reshape((test_images.shape[0], test_images.shape[1]*test_images.shape[2])).astype(np.float32)/255
num_epochs = 200
batch_size = 100
learning_rate = 0.0003
class MNISTModel(tf.keras.Model):
def __init__(self, device='/gpu:0'):
super(MNISTModel, self).__init__()
self.device = device
self.initializer = tf.initializers.random_uniform(0.0, 0.5)
self.hidden = tf.keras.layers.Dense(200, use_bias=False, kernel_initializer=tf.initializers.random_uniform(0.0, 0.5), name="Hidden")
self.out = tf.keras.layers.Dense(train_images.shape[1], use_bias=False, kernel_initializer=tf.initializers.random_uniform(0.0, 0.5), name="Output")
self.hidden.build(train_images.shape[1])
self.out.build(200)
def call(self, x):
return self.out(self.hidden(x))
def loss_func(model, x, y_):
return tf.reduce_mean(tf.losses.mean_squared_error(labels=y_, predictions=model(x)))
#return tf.reduce_mean((y_ - model(x))**4)
model = MNISTModel()
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
for epochs in range(num_epochs):
print("Started epoch ", epochs)
print("Num batches is: ", train_images.shape[0]/batch_size)
for i in range(0,1): #(int(train_images.shape[0]/batch_size)):
with tfe.GradientTape(persistent=True) as tape:
tape.watch(model.variables)
with tfe.GradientTape() as in_tape:
in_tape.watch(model.variables)
loss = loss_func(model,train_images[0:batch_size],train_images[0:batch_size])
grads = tape.gradient(loss, model.variables)
IH_partial_grads = np.array([])
for i in range(len(grads[0])):
collector = np.array([])
for j in range(len(grads[0][i])):
collector = np.append(collector, tape.gradient(grads[0][i][j], model.variables[0]))
IH_partial_grads = np.append(IH_partial_grads, collector)
optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())
print("Epoch test loss: ", loss_func(model, test_images, test_images))
My ultimate goal is to form the hessian matrix for the loss function with respect to all parameters of my network.
Thanks for any and all help!

What is the proper way to weight decay for Adam Optimizer

Since Adam Optimizer keeps an pair of running averages like mean/variance for the gradients, I wonder how it should properly handle weight decay. I have seen two ways of implementing it.
Only update mean/variance from the gradients based on the objective loss, decay weight explicitly at each mini-batch. (the following code is taken from https://github.com/dmlc/mxnet/blob/v0.7.0/python/mxnet/optimizer.py)
weight[:] -= lr*mean/(sqrt(variance) + self.epsilon)
wd = self._get_wd(index)
if wd > 0.:
weight[:] -= (lr * wd) * weight
Update mean/variance from the gradients based on the objective loss + regularization loss, and update weights like usual. (the following code is taken from https://github.com/dmlc/mxnet/blob/master/src/operator/optimizer_op-inl.h#L210)
grad = scalar<DType>(param.rescale_grad) * grad +
scalar<DType>(param.wd) * weight;
// stuff
Assign(out, req[0],
weight -
scalar<DType>(param.lr) * mean /
(F<square_root>(var) + scalar<DType>(param.epsilon)));
These two approaches sometimes show significant difference in training results. And I actually think the first one makes more sense (and find it gives better results time to time). Caffe and old version of mxnet follow the first approach, while torch, tensorflow and new version of mxnet follow the second one.
Really appreciate your help!
Edit: see also this PR which just got merged into TF.
When using pure SGD (without momentum) as an optimizer, weight decay is the same thing as adding a L2-regularization term to the loss. When using any other optimizer, this is not true.
Weight decay (don't know how to TeX here, so excuse my pseudo-notation):
w[t+1] = w[t] - learning_rate * dw - weight_decay * w
L2-regularization:
loss = actual_loss + lambda * 1/2 sum(||w||_2 for w in network_params)
Computing the gradient of the extra term in L2-regularization gives lambda * w and thus inserting it into the SGD update equation
dloss_dw = dactual_loss_dw + lambda * w
w[t+1] = w[t] - learning_rate * dw
gives the same as weight decay, but mixes lambda with the learning_rate. Any other optimizer, even SGD with momentum, gives a different update rule for weight decay as for L2-regularization! See the paper Fixing weight decay in Adam for more details. (Edit: AFAIK, this 1987 Hinton paper introduced "weight decay", literally as "each time the weights are updated, their magnitude is also decremented by 0.4%" at page 10)
That being said, there doesn't seem to be support for "proper" weight decay in TensorFlow yet. There are a few issues discussing it, specifically because of above paper.
One possible way to implement it is by writing an op that does the decay step manually after every optimizer step. A different way, which is what I'm currently doing, is using an additional SGD optimizer just for the weight decay, and "attaching" it to your train_op. Both of these are just crude work-arounds, though. My current code:
# In the network definition:
with arg_scope([layers.conv2d, layers.dense],
weights_regularizer=layers.l2_regularizer(weight_decay)):
# define the network.
loss = # compute the actual loss of your problem.
train_op = optimizer.minimize(loss, global_step=global_step)
if args.weight_decay not in (None, 0):
with tf.control_dependencies([train_op]):
sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
train_op = sgd.minimize(tf.add_n(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))
This somewhat makes use of TensorFlow's provided bookkeeping. Note that the arg_scope takes care of appending an L2-regularization term for every layer to the REGULARIZATION_LOSSES graph-key, which I then all sum up and optimize using SGD which, as shown above, corresponds to actual weight-decay.
Hope that helps, and if anyone gets a nicer code snippet for this, or TensorFlow implements it better (i.e. in the optimizers), please share.
I came across the same question. I think this code that I got from here will work for you. It implements the weight decay adam optimizer by inheritance from the tf.train.Optimizer. This is the cleanest solution I have found:
class AdamWeightDecayOptimizer(tf.train.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=None,
name="AdamWeightDecayOptimizer"):
"""Constructs a AdamWeightDecayOptimizer."""
super(AdamWeightDecayOptimizer, self).__init__(False, name)
self.learning_rate = learning_rate
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self.exclude_from_weight_decay = exclude_from_weight_decay
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
"""See base class."""
assignments = []
for (grad, param) in grads_and_vars:
if grad is None or param is None:
continue
param_name = self._get_variable_name(param.name)
m = tf.get_variable(
name=param_name + "/adam_m",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
v = tf.get_variable(
name=param_name + "/adam_v",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
# Standard Adam update.
next_m = (
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
next_v = (
tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
tf.square(grad)))
update = next_m / (tf.sqrt(next_v) + self.epsilon)
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want ot decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(param_name):
update += self.weight_decay_rate * param
update_with_lr = self.learning_rate * update
next_param = param - update_with_lr
assignments.extend(
[param.assign(next_param),
m.assign(next_m),
v.assign(next_v)])
return tf.group(*assignments, name=name)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _get_variable_name(self, param_name):
"""Get the variable name from the tensor name."""
m = re.match("^(.*):\\d+$", param_name)
if m is not None:
param_name = m.group(1)
return param_name
And you can use it in the following way (I have made some changes to make it useful in a more general context), This function will return a train_op that can be used in the Session:
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = AdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6)
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# You can do clip gradients if you need in this step(in general it is not neccessary)
# (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)
# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op

Add a summary of accuracy of the whole train/test dataset in Tensorflow

I am trying to use Tensorboard to visualize my training procedure. My purpose is, when every epoch completed, I would like to test the network's accuracy using the whole validation dataset, and store this accuracy result into a summary file, so that I can visualize it in Tensorboard.
I know Tensorflow has summary_op to do it, however it seems only work for one batch when running the code sess.run(summary_op). I need to calculate the accuracy for the whole dataset. How?
Is there any example to do it?
Define a tf.scalar_summary that accepts a placeholder:
accuracy_value_ = tf.placeholder(tf.float32, shape=())
accuracy_summary = tf.scalar_summary('accuracy', accuracy_value_)
Then calculate the accuracy for the whole dataset (define a routine that calculates the accuracy for every batch in the dataset and extract the mean value) and save it into a python variable, let's call it va.
Once you have the value of va, just run the accuracy_summary op, feeding the accuracy_value_ placeholder:
sess.run(accuracy_summary, feed_dict={accuracy_value_: va})
I implement a naive one-layer model as an example to classify MNIST dataset and visualize validation accuracy in Tensorboard, it works for me.
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
import os
# number of epoch
num_epoch = 1000
model_dir = '/tmp/tf/onelayer_model/accu_info'
# mnist dataset location, change if you need
data_dir = '../data/mnist'
# load MNIST dataset without one hot
dataset = read_data_sets(data_dir, one_hot=False)
# Create placeholder for input images X and labels y
X = tf.placeholder(tf.float32, [None, 784])
# one_hot = False
y = tf.placeholder(tf.int32)
# One layer model graph
W = tf.Variable(tf.truncated_normal([784, 10], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=[10]))
logits = tf.nn.relu(tf.matmul(X, W) + b)
init = tf.initialize_all_variables()
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)
# loss function
loss = tf.reduce_mean(cross_entropy)
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
_, top_1_op = tf.nn.top_k(logits)
top_1 = tf.reshape(top_1_op, shape=[-1])
correct_classification = tf.cast(tf.equal(top_1, y), tf.float32)
# accuracy function
acc = tf.reduce_mean(correct_classification)
# define info that is used in SummaryWritter
acc_summary = tf.scalar_summary('valid_accuracy', acc)
valid_summary_op = tf.merge_summary([acc_summary])
with tf.Session() as sess:
# initialize all the variable
sess.run(init)
print("Writing Summaries to %s" % model_dir)
train_summary_writer = tf.train.SummaryWriter(model_dir, sess.graph)
# load validation dataset
valid_x = dataset.validation.images
valid_y = dataset.validation.labels
for epoch in xrange(num_epoch):
batch_x, batch_y = dataset.train.next_batch(100)
feed_dict = {X: batch_x, y: batch_y}
_, acc_value, loss_value = sess.run(
[train_op, acc, loss], feed_dict=feed_dict)
vsummary = sess.run(valid_summary_op,
feed_dict={X: valid_x,
y: valid_y})
# Write validation accuracy summary
train_summary_writer.add_summary(vsummary, epoch)
Using batching with your validation set is possible in case you are using tf.metrics ops, which use internal counters. Here is a simplified example:
model = create_model()
tf.summary.scalar('cost', model.cost_op)
acc_value_op, acc_update_op = tf.metrics.accuracy(labels,predictions)
summary_common = tf.summary.merge_all()
summary_valid = tf.summary.merge([
tf.summary.scalar('accuracy', acc_value_op),
# other metrics here...
])
with tf.Session() as sess:
train_writer = tf.summary.FileWriter(logs_path + '/train',
sess.graph)
valid_writer = tf.summary.FileWriter(logs_path + '/valid')
While training, only write the common summary using your train-writer:
summary = sess.run(summary_common)
train_writer.add_summary(summary, tf.train.global_step(sess, gstep_op))
train_writer.flush()
After every validation, write both summaries using the valid-writer:
gstep, summaryc, summaryv = sess.run([gstep_op, summary_common, summary_valid])
valid_writer.add_summary(summaryc, gstep)
valid_writer.add_summary(summaryv, gstep)
valid_writer.flush()
When using tf.metrics, don't forget to reset the internal counters (local variables) before every validation step.