Distributed Tensorflow: good example for synchronous training on CPUs - tensorflow

I am new to distributed TensorFlow and am looking for a good example of synchronous training on CPUs.
I have already tried the Distributed TensorFlow Example, and it performs asynchronous training successfully over 1 parameter server (1 machine with 1 CPU) and 3 workers (each worker = 1 machine with 1 CPU). However, when it comes to synchronous training, I am not able to run it correctly, although I have followed the tutorial of SyncReplicasOptimizer (V1.0 and V2.0).
I have inserted the official SyncReplicasOptimizer code into the working asynchronous training example, but the training process is still asynchronous. My detailed code is below. Any code related to synchronous training is within the blocks marked with ******.
import tensorflow as tf
import sys
import time
# cluster specification ----------------------------------------------------------------------
parameter_servers = ["xx1.edu:2222"]
workers = ["xx2.edu:2222", "xx3.edu:2222", "xx4.edu:2222"]
cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers})
# input flags
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
# start a server for a specific task
server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
# Parameters ----------------------------------------------------------------------
N = 3 # number of replicas
learning_rate = 0.001
training_epochs = int(21/N)
batch_size = 100
# Network Parameters
n_input = 784 # MNIST data input (img shape: 28*28)
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_classes = 10 # MNIST total classes (0-9 digits)
if FLAGS.job_name == "ps":
server.join()
print("--- Parameter Server Ready ---")
elif FLAGS.job_name == "worker":
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Between-graph replication
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % FLAGS.task_index,
cluster=cluster)):
# count the number of updates
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0),
trainable = False,
dtype = tf.int32)
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
# Create model
def multilayer_perceptron(x, weights, biases):
# Hidden layer with RELU activation
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
# Hidden layer with RELU activation
layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
layer_2 = tf.nn.relu(layer_2)
# Output layer with linear activation
out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
return out_layer
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'b2': tf.Variable(tf.random_normal([n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
# ************************* SyncReplicasOpt Version 1.0 *****************************************************
''' This optimizer collects gradients from all replicas, "summing" them,
then applying them to the variables in one shot, after which replicas can fetch the new variables and continue. '''
# Create any optimizer to update the variables, say a simple SGD
opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Wrap the optimizer with sync_replicas_optimizer with N replicas: at each step the optimizer collects N gradients before applying to variables.
opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=N,
replica_id=FLAGS.task_index, total_num_replicas=N)
# Now you can call `minimize()` or `compute_gradients()` and `apply_gradients()` normally
train = opt.minimize(cost, global_step=global_step)
# You can now call get_init_tokens_op() and get_chief_queue_runner().
# Note that get_init_tokens_op() must be called before creating session
# because it modifies the graph.
init_token_op = opt.get_init_tokens_op()
chief_queue_runner = opt.get_chief_queue_runner()
# **************************************************************************************
# Test model
correct = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, "float"))
# Initializing the variables
init_op = tf.initialize_all_variables()
print("---Variables initialized---")
# **************************************************************************************
is_chief = (FLAGS.task_index == 0)
# Create a "supervisor", which oversees the training process.
sv = tf.train.Supervisor(is_chief=is_chief,
logdir="/tmp/train_logs",
init_op=init_op,
global_step=global_step,
save_model_secs=600)
# **************************************************************************************
with sv.prepare_or_wait_for_session(server.target) as sess:
# **************************************************************************************
# After the session is created by the Supervisor and before the main while loop:
if is_chief:
sv.start_queue_runners(sess, [chief_queue_runner])
# Insert initial tokens to the queue.
sess.run(init_token_op)
# **************************************************************************************
# Statistics
net_train_t = 0
# Training
for epoch in range(training_epochs):
total_batch = int(mnist.train.num_examples/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# ======== net training time ========
begin_t = time.time()
sess.run(train, feed_dict={x: batch_x, y: batch_y})
end_t = time.time()
net_train_t += (end_t - begin_t)
# ===================================
# Calculate training accuracy
# acc = sess.run(accuracy, feed_dict={x: mnist.train.images, y: mnist.train.labels})
# print("Epoch:", '%04d' % (epoch+1), " Train Accuracy =", acc)
print("Epoch:", '%04d' % (epoch+1))
print("Training Finished!")
print("Net Training Time: ", net_train_t, "second")
# Testing
print("Testing Accuracy = ", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
sv.stop()
print("done")
Is there anything wrong with my code? Or could someone point me to a good example to follow?

I think your question is answered by the comments in issue #9596 of TensorFlow.
This problem is caused by bugs in the new version of tf.train.SyncReplicasOptimizer(). You can use an older version of this API to avoid the problem.
Another solution comes from the TensorFlow Distributed Benchmarks. If you look at the source code, you will find that they synchronize the workers manually through queues in TensorFlow. In experiments, this benchmark runs exactly as you would expect.
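For illustration, here is a minimal sketch of that manual, queue-based barrier idea (the helper function and the shared_name strings are hypothetical, not copied from the benchmark code); each worker signals every other worker and then waits until it has been signalled by all of them:
def worker_barrier(num_workers, task_index, name="sync_barrier"):
    # One shared token queue per worker, hosted on the parameter server.
    with tf.device("/job:ps/task:0"):
        queues = [tf.FIFOQueue(num_workers, tf.bool, shapes=[[]],
                               shared_name="%s_%d" % (name, i),
                               name="%s_%d" % (name, i))
                  for i in range(num_workers)]
    # Signal all other workers that this worker has reached the barrier ...
    signals = [queues[i].enqueue(True)
               for i in range(num_workers) if i != task_index]
    with tf.control_dependencies(signals):
        # ... then block until every other worker has signalled this one.
        return queues[task_index].dequeue_many(num_workers - 1)
# Usage: build the op once, then run it after each training step, e.g.
# barrier = worker_barrier(len(workers), FLAGS.task_index)
# sess.run(train, feed_dict={...}); sess.run(barrier)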
Hope these comments and resources can help you solve your problem. Thanks!

I am not sure if you would be interested in user-transparent distributed TensorFlow, which uses MPI in the backend. We have recently developed one such version with MaTEx: https://github.com/matex-org/matex.
With it, you would not need to write any SyncReplicasOptimizer code for distributed TensorFlow, since all the changes are abstracted away from the user.
Hope this helps.

One issue is that you need to specify an aggregation_method in the minimize method for it to run synchronously:
train = opt.minimize(cost, global_step=global_step, aggregation_method=tf.AggregationMethod.ADD_N)
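For reference, here is a minimal sketch of the newer, hook-based (V2-style) way to wire up tf.train.SyncReplicasOptimizer, reusing the cluster, flags, cost and global_step from the question; the hook replaces the manual get_init_tokens_op()/get_chief_queue_runner() plumbing and the Supervisor:
# Hedged sketch of the hook-based synchronous setup (names reused from the question).
opt = tf.train.SyncReplicasOptimizer(
    tf.train.AdamOptimizer(learning_rate=learning_rate),
    replicas_to_aggregate=N,
    total_num_replicas=N)
train = opt.minimize(cost, global_step=global_step)
sync_hook = opt.make_session_run_hook(is_chief)  # handles init tokens and the sync queues
with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=is_chief,
                                       hooks=[sync_hook]) as sess:
    for epoch in range(training_epochs):
        for _ in range(int(mnist.train.num_examples / batch_size)):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            sess.run(train, feed_dict={x: batch_x, y: batch_y})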

Related

How to view the graphs on tensorboard with eventFiles already present in Folders?

I have the tf.event files present in a folder. I run the command to view them, but I am still not able to see the graph.
Please find the code attached; only the code related to the graph is provided.
I am using TensorFlow 1.8; upgrading had a lot of issues, so I am using the lower version.
#Initialize the FileWriter
with tf.Session() as sess:
writer = tf.summary.FileWriter("./Training_FileWriter/", sess.graph)
writer1 = tf.summary.FileWriter("./Validation_FileWriter/", sess.graph)
#Add the cost and accuracy to summary
tf.summary.scalar('loss', tf.squeeze(cross_entropy))
tf.summary.scalar('accuracy', tf.squeeze(accuracy))
#Merge all summaries together
merged_summary = tf.summary.merge_all()
#
#
#After executing loss, optimizer, accuracy
summ = sess.run(merged_summary, feed_dict=feed_dict_train)
writer.add_summary(summ, epoch*int(len(trainLabels)/batch_size) + batch)
Would it help to have a full-fledged example like this? I am able to view the graphs with it.
tensorboard --logdir=D:\Development_Avecto\TensorFlow\logs\1\train
TensorBoard 1.9.0 at http://LT032871:6006 (Press CTRL+C to quit)
import tensorflow as tf
# reset everything to rerun in jupyter
tf.reset_default_graph()
# config
batch_size = 100
learning_rate = 0.5
training_epochs = 5
logs_path = "D:/Development_Avecto/TensorFlow/logs/1/train"
# load mnist data set
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# input images
with tf.name_scope('input'):
# None -> batch size can be any size, 784 -> flattened mnist image
x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
# target 10 output classes
y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
# model parameters will change during training so we use tf.Variable
with tf.name_scope("weights"):
W = tf.Variable(tf.zeros([784, 10]))
# bias
with tf.name_scope("biases"):
b = tf.Variable(tf.zeros([10]))
# implement model
with tf.name_scope("softmax"):
# y is our prediction
y = tf.nn.softmax(tf.matmul(x, W) + b)
# specify cost function
with tf.name_scope('cross_entropy'):
# this is our cost
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# specify optimizer
with tf.name_scope('train'):
# optimizer is an "operation" which we can execute in a session
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)
with tf.name_scope('Accuracy'):
# Accuracy
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# create a summary for our cost and accuracy
tf.summary.scalar("cost", cross_entropy)
tf.summary.scalar("accuracy", accuracy)
# merge all summaries into a single "operation" which we can execute in a session
summary_op = tf.summary.merge_all()
with tf.Session() as sess:
# variables need to be initialized before we can use them
sess.run(tf.initialize_all_variables())
# create log writer object
writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
# perform training cycles
for epoch in range(training_epochs):
# number of batches in one epoch
batch_count = int(mnist.train.num_examples / batch_size)
for i in range(batch_count):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# perform the operations we defined earlier on batch
_, summary = sess.run([train_op, summary_op], feed_dict={x: batch_x, y_: batch_y})
# write log
writer.add_summary(summary, epoch * batch_count + i)
if epoch % 5 == 0:
print("Epoch: ", epoch)
print("Accuracy: ", accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
print("done")

How to switch from GradientDescent Optimizer to Adam in Tensorflow

My code is running perfectly with Gradient Descent, but I want to compare the effectiveness of my algorithm using Adam Optimizer, so I tried to modify the following code:
# Import MNIST data
#import input_data
#mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
#fashion_mnist = input_data.read_data_sets('data/fashion')
import tensorflow as tf
# Set parameters
learning_rate = 0.01 #1e-4
training_iteration = 30
batch_size = 100
display_step = 2
# TF graph input
x = tf.placeholder("float", [None, 784]) # mnist data image of shape 28*28=784
y = tf.placeholder("float", [None, 10]) # 0-9 digits recognition => 10 classes
#regularizer = tf.reduce_sum(tf.square(y))
# Create a model
# Set model weights
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
with tf.name_scope("Wx_b") as scope:
# Construct a linear model
model = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax
# Add summary ops to collect data
w_h = tf.summary.histogram("weights", W)
b_h = tf.summary.histogram("biases", b)
# More name scopes will clean up graph representation
with tf.name_scope("cost_function") as scope:
# Minimize error using cross entropy
# Cross entropy
cost_function = -tf.reduce_sum(y*tf.log(model))
# Create a summary to monitor the cost function
tf.summary.scalar("cost_function", cost_function)
with tf.name_scope("train") as scope:
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
# Initializing the variables
#init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
# Merge all summaries into a single operator
merged_summary_op = tf.summary.merge_all()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
summary_writer = tf.summary.FileWriter('/home/raed/Tensorflow/tensorflow_demo', graph_def =sess.graph_def)
#writer.add_graph(sess.graph_def)
# Training cycle
for iteration in range(training_iteration):
avg_cost = 0.
total_batch = int(mnist.train.num_examples/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_xs, batch_ys = mnist.train.next_batch(batch_size)
# Fit training using batch data
sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
# Compute the average loss
avg_cost += sess.run(cost_function, feed_dict={x: batch_xs, y: batch_ys})/total_batch
# Write logs for each iteration
summary_str = sess.run(merged_summary_op, feed_dict={x: batch_xs, y: batch_ys})
summary_writer.add_summary(summary_str, iteration*total_batch + i)
# Display logs per iteration step
if iteration % display_step == 0:
print ("Iteration:" "%04d" % (iteration + 1), "cost=", "{:.9f}".format(avg_cost))
print ("Tuning completed!")
# Test the model
predictions = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(predictions, "float"))
print ("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
To use the Adam optimizer, I tried to change the following line:
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
and replace it with the AdamOptimizer:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_function)
When I ran the code, it completed a few iterations and then stopped with the following error.
InvalidArgumentError (see above for traceback): Nan in summary histogram for: weights
[[Node: weights = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](weights/tag, Variable/read)]]
Could you please help me understand the problem? Thanks in advance.
The problem is that the weights are initialized to zero (W = tf.Variable(tf.zeros([784, 10]))); that is why you get NaN in the weights summary.
You need to initialize them with some initializer, e.g. a normal distribution, as follows:
W = tf.Variable(tf.random_normal([784, 10], stddev=0.35),
name="weights")
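Putting the pieces together, a minimal sketch of the changed lines, keeping the names from the question's code:
# Non-zero weight initialization plus the Adam optimizer; the rest of the script stays the same.
W = tf.Variable(tf.random_normal([784, 10], stddev=0.35), name="weights")
b = tf.Variable(tf.zeros([10]))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_function)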

L2 regularization keep increasing during training

I am fine-tuning InceptionResnetV2 on TensorFlow. During training, the regularization loss keeps increasing linearly and even becomes much larger than the cross-entropy loss in the later stages of training. I have checked the training procedure and made sure I am optimizing the cross-entropy loss and the L2 loss combined.
Can anyone explain this weird behavior a little bit? Any feedback is appreciated.
Here is the code and some TensorBoard plots.
import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
from preprocessing import aug_parallel_v2
import numpy as np
slim = tf.contrib.slim
# total training data number
sample_num = 625020
data_path = 'iNaturalist_train.tfrecords'
# State where your log file is at. If it doesn't exist, create it.
log_dir = './log_v5'
# tensorboard visualization path
filewriter_path = './filewriter_v5_Logits'
# State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
checkpoint_save_addr = './log_v5/fine-tuning_v5.ckpt'
# State the image size you're resizing your images to. We will use the default inception size of 299.
image_size = 299
# State the number of classes to predict:
num_classes = 8142
# ================= TRAINING INFORMATION ==================
# State the number of epochs to train
num_epochs = 5
# State your batch size
batch_size = 60
# Learning rate information and configuration
initial_learning_rate = 0.0005
learning_rate_decay_factor = 0.8
num_epochs_before_decay = 2
# put weight on different classes inversely proportional
# to total number of their image samples
label_count = np.loadtxt('label_count.txt', dtype=int)
inverse = lambda t: 1 / t
vfunc = np.vectorize(inverse)
multiplier = vfunc(label_count)
multiplier /= np.mean(multiplier)
def run():
if not os.path.exists(log_dir):
os.mkdir(log_dir)
feature = {'train/height': tf.FixedLenFeature([], tf.int64),
'train/width': tf.FixedLenFeature([], tf.int64),
'train/image': tf.FixedLenFeature([], tf.string),
'train/label': tf.FixedLenFeature([], tf.int64),
'train/sup_label': tf.FixedLenFeature([], tf.int64),
'train/aug_level': tf.FixedLenFeature([], tf.int64)}
# create a list of file names
filename_queue = tf.train.string_input_producer([data_path], num_epochs=None)
print(filename_queue)
reader = tf.TFRecordReader()
_, tfrecord_serialized = reader.read(filename_queue)
features = tf.parse_single_example(tfrecord_serialized, features=feature)
# Convert the image data from string back to the numbers
height = tf.cast(features['train/height'], tf.int64)
width = tf.cast(features['train/width'], tf.int64)
# change this line for your TFrecord version
tf_image = tf.image.decode_jpeg(features['train/image'])
tf_label = tf.cast(features['train/label'], tf.int32)
aug_level = tf.cast(features['train/aug_level'], tf.int32)
# tf_sup_label = tf.cast(features['train/sup_label'], tf.int64)
tf_image = tf.reshape(tf_image, tf.stack([height, width, 3]))
tf_label = tf.reshape(tf_label, [1])
aug_level = tf.reshape(aug_level, [1])
resized_image = tf.image.resize_images(images=tf_image, size=tf.constant([400, 400]), method=2)
resized_image = tf.cast(resized_image, tf.uint8)
tf_images, tf_labels, tf_aug = tf.train.shuffle_batch([resized_image, tf_label, aug_level], batch_size=batch_size,
capacity=2048, num_threads=16, allow_smaller_final_batch=False,
min_after_dequeue=256)
tf.logging.set_verbosity(tf.logging.INFO) # Set the verbosity to INFO level
IMAGE_HEIGHT = 299
IMAGE_WIDTH = 299
images = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
labels = tf.placeholder(dtype=tf.int32, shape=[None, 1])
weighted_level = tf.placeholder(dtype=tf.float32, shape=[None, 1])
# Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(sample_num / batch_size)
num_steps_per_epoch = num_batches_per_epoch # Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
# Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(images, num_classes=num_classes, is_training=True)
# Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
print("label test")
print(labels)
print(logits)
# Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = tf.squeeze(tf.one_hot(labels, num_classes), [1])
print(one_hot_labels)
print(logits)
weighted_onehot = tf.multiply(one_hot_labels, weighted_level)
# Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
digits_loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_onehot, logits=logits)
reg_loss = tf.losses.get_regularization_loss()
total_loss = digits_loss + reg_loss
# Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate=initial_learning_rate,
global_step=global_step,
decay_steps=decay_steps,
decay_rate=learning_rate_decay_factor,
staircase=True)
# train_vars = []
# Now we can define the optimizer that takes on the learning rate
train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
"InceptionResnetV2/Logits")
# RMSProp or Adam
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
# Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer, variables_to_train=train_vars)
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.metrics.accuracy(predictions, labels)
metrics_op = tf.group(accuracy_update, probabilities)
tf.summary.scalar('losses/Reg_Loss', reg_loss)
tf.summary.scalar('losses/Digit_Loss', digits_loss)
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
writer = tf.summary.FileWriter(filewriter_path)
writer.add_graph(tf.get_default_graph())
my_summary_op = tf.summary.merge_all()
def train_step(sess, train_op, global_step, imgs, lbls, weight):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed
for each global step
'''
# Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
time_elapsed = time.time() - start_time
# Run the logging to print some results
logging.info('global step %s: digit_loss: %.4f (%.2f sec/step)',
global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
saver_pretrain = tf.train.Saver(variables_to_restore)
saver_train = tf.train.Saver(train_vars)
with tf.Session() as sess:
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
# Create a coordinator and run all QueueRunner objects
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
saver_pretrain.restore(sess, checkpoint_file)
start_time = time.time()
for step in range(int(num_steps_per_epoch * num_epochs)):
imgs, lbls, augs = sess.run([tf_images, tf_labels, tf_aug])
imgs, lbls = aug_parallel_v2(imgs, lbls, augs)
imgs = imgs[:, 50:349, 50:349, :]
imgs = 2*(imgs.astype(np.float32)) - 1
lbls = lbls.astype(np.int32)
weight = multiplier[lbls]
weight = np.array(weight).reshape((batch_size, 1))
# print(imgs[0, 0:10, 0:10, 0:2])
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step / num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run(
[logits, probabilities, predictions, labels],
feed_dict={images: imgs, labels: lbls, weighted_level: weight})
print('logits: \n', logits_value)
print('Probabilities: \n', probabilities_value)
print('predictions: \n', predictions_value)
print('Labels:\n:', labels_value)
# Log the summaries every 10 step.
if step % 20 == 0:
loss, global_step_count = train_step(sess, train_op, global_step, imgs, lbls, weight)
summaries = sess.run(my_summary_op, feed_dict={images: imgs, labels: lbls, weighted_level: weight})
writer.add_summary(summaries, global_step_count)
# sess.summary_computed(sess, summaries)
# If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, global_step, imgs, lbls, weight)
if step % 2000 == 0:
logging.info('Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
print('one batch time: ', time.time() - start_time)
start_time = time.time()
# We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
# Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
saver_train.save(sess, checkpoint_save_addr, global_step=global_step)
# Stop the threads
coord.request_stop()
# Wait for threads to stop
coord.join(threads)
sess.close()
if __name__ == '__main__':
run()
I am new here and don't have enough reputation to post images.
Here are two links to the accuracy plot and the losses plot. You can easily tell that the regularization loss is dominant.
This is a difficult question to answer. I can give some pointers though.
In general, when you try to minimize digits_loss, that is, to fit your model to your data, you slowly change the weights in your layers. To counter potential overfitting, an L2 regularization loss (the sum of the squares of all weights, reg_loss in your code) is generally added to the overall loss (total_loss in your code). These two forces generally act against each other, and if the balance is right, you train a good model.
In your case you're taking a network (resnet_v2) that was developed for 1,001 classes and trying to predict 8,142 classes. No problem with that per se, but you're upsetting the balance. So I believe you need to override the default weight decay of 0.00004 for resnet v2 to some higher value, in this line (note only 3 zeros in the decimals for a 10x increase):
with slim.arg_scope( inception_resnet_v2_arg_scope( weight_decay = 0.0004 ) ):
A higher weight_decay parameter will force the L2 loss to decrease faster. The problem is that this number is just a guess; I have no idea what an ideal value would be. You need to experiment with multiple values and figure it out.
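As a rough sanity check, this is approximately what the slim-style regularization term amounts to when l2_regularizer(weight_decay) is attached to the weight variables through the arg_scope (the snippet below is illustrative, not part of the training code):
weight_decay = 0.0004  # candidate value; needs to be tuned experimentally
approx_reg_loss = weight_decay * tf.add_n(
    [tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'weights' in v.name])
# Compare this against tf.losses.get_regularization_loss() to see how strongly
# the chosen weight_decay scales the L2 term relative to digits_loss.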

Can i generate the input given the output in a pretrained Tensorflow model?

Let's assume I have trained a model for the MNIST task, given the following code:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf
# Parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1
# Network Parameters
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'b2': tf.Variable(tf.random_normal([n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Create model
def multilayer_perceptron(x, weights, biases):
# Hidden layer with RELU activation
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
# Hidden layer with RELU activation
layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
layer_2 = tf.nn.relu(layer_2)
# Output layer with linear activation
out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
return out_layer
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Test model
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
avg_acc = 0.
total_batch = int(mnist.train.num_examples/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# Run optimization op (backprop) and cost op (to get loss value)
_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
batch_acc = accuracy.eval({x: batch_x, y: batch_y})
# Compute average loss
avg_cost += c / total_batch
avg_acc += batch_acc / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
test_acc = accuracy.eval({x: mnist.test.images, y: mnist.test.labels})
print(
"Epoch:",
'%04d' % (epoch+1),
"cost=",
"{:.9f}".format(avg_cost),
"average_train_accuracy=",
"{:.6f}".format(avg_acc),
"test_accuracy=",
"{:.6f}".format(test_acc)
)
print("Optimization Finished!")
So this model predicts the number shown in an image, given the image.
Once I have trained it, could I make the input a 'variable' instead of a 'placeholder' and try to reverse-engineer the input given an output?
For example, I would like to feed the output '8' and produce a representative image of the number eight.
I thought of the following (a sketch of the idea is shown after this list):
1. Freezing the model.
2. Adding a variable matrix 'M' of the same size as the input, between the input and the weights.
3. Feeding an identity matrix as input to the input placeholder.
4. Running the optimizer to learn the 'M' matrix.
Is there a better way?
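For reference, a minimal sketch of the 'optimize the input' idea from the list above, meant to run in the same session after training; x_var and target_digit are names introduced here for illustration:
# Freeze the trained weights and optimize a free input variable so that the
# network's prediction matches a chosen target digit.
target_digit = 8
x_var = tf.Variable(tf.zeros([1, n_input]), name="generated_input")
inv_logits = multilayer_perceptron(x_var, weights, biases)
inv_target = tf.one_hot([target_digit], n_classes)
inv_cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=inv_logits, labels=inv_target))
# Only x_var is updated; the trained weights and biases stay frozen.
inv_step = tf.train.GradientDescentOptimizer(0.5).minimize(inv_cost, var_list=[x_var])
# sess.run(x_var.initializer)
# for _ in range(1000): sess.run(inv_step)
# image = sess.run(x_var).reshape(28, 28)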
If your goal is to reverse the model in the sense that the input should be a digit and the output an image displaying that digit (in, say, handwritten form), it is not quite possible to do with machine learning models.
Because machine learning models attempt to create generalizations from the input (so that similar input will provide similar output, even though the model was never trained on it), they tend to be quite lossy. Additionally, the reduction from hundreds, thousands or more input variables to a single output variable obviously has to lose some information in the process.
More specifically, although a multilayer perceptron (as you're using in your example) is a fully connected neural network, some weights are expected to be zero, thus completely dropping the information in certain input variables. Moreover, the same output of a neuron can be produced by multiple distinct input values to its function, due to the many degrees of freedom.
It is theoretically possible to replace those degrees of freedom and lost information with specifically crafted or random data, but that does not guarantee a successful output.
On a side note, I'm a bit puzzled by this question. If you are able to generate that model yourself, you could also create a similar model that does the opposite. You could train a model to accept an input digit (and perhaps some random seed) and output an image.

Run distributed tensorflow example with error

I have three nodes to run distributed TensorFlow: two workers (one with a GPU, one without) and one ps (without a GPU). The code is below:
from __future__ import print_function
import tensorflow as tf
import sys
import time
# cluster specification
parameter_servers = ["192.168.1.102:2222"]
workers = [ "192.168.1.103:2223",
"192.168.1.104:2224"]
cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers})
# input flags
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS
# start a server for a specific task
server = tf.train.Server(cluster,
job_name=FLAGS.job_name,
task_index=FLAGS.task_index)
# config
batch_size = 100
learning_rate = 0.001
training_epochs = 20
logs_path = "/tmp/mnist/1"
# load mnist data set
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
if FLAGS.job_name == "ps":
server.join()
elif FLAGS.job_name == "worker":
# Between-graph replication
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % FLAGS.task_index,
cluster=cluster)):
# count the number of updates
global_step = tf.get_variable('global_step', [],
initializer = tf.constant_initializer(0),
trainable = False)
# input images
with tf.name_scope('input'):
# None -> batch size can be any size, 784 -> flattened mnist image
x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
# target 10 output classes
y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
# model parameters will change during training so we use tf.Variable
tf.set_random_seed(1)
with tf.name_scope("weights"):
W1 = tf.Variable(tf.random_normal([784, 100]))
W2 = tf.Variable(tf.random_normal([100, 10]))
# bias
with tf.name_scope("biases"):
b1 = tf.Variable(tf.zeros([100]))
b2 = tf.Variable(tf.zeros([10]))
# implement model
with tf.name_scope("softmax"):
# y is our prediction
z2 = tf.add(tf.matmul(x,W1),b1)
a2 = tf.nn.sigmoid(z2)
z3 = tf.add(tf.matmul(a2,W2),b2)
y = tf.nn.softmax(z3)
# specify cost function
with tf.name_scope('cross_entropy'):
# this is our cost
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
# specify optimizer
with tf.name_scope('train'):
# optimizer is an "operation" which we can execute in a session
grad_op = tf.train.GradientDescentOptimizer(learning_rate)
train_op = grad_op.minimize(cross_entropy, global_step=global_step)
with tf.name_scope('Accuracy'):
# accuracy
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# create a summary for our cost and accuracy
tf.scalar_summary("cost", cross_entropy)
tf.scalar_summary("accuracy", accuracy)
# merge all summaries into a single "operation" which we can execute in a session
summary_op = tf.merge_all_summaries()
init_op = tf.initialize_all_variables()
print("Variables initialized ...")
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
global_step=global_step,
init_op=init_op)
begin_time = time.time()
frequency = 100
with sv.prepare_or_wait_for_session(server.target) as sess:
# create log writer object (this will log on every machine)
writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())
# perform training cycles
start_time = time.time()
for epoch in range(training_epochs):
# number of batches in one epoch
batch_count = int(mnist.train.num_examples/batch_size)
count = 0
for i in range(batch_count):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# perform the operations we defined earlier on batch
_, cost, summary, step = sess.run(
[train_op, cross_entropy, summary_op, global_step],
feed_dict={x: batch_x, y_: batch_y})
writer.add_summary(summary, step)
count += 1
if count % frequency == 0 or i+1 == batch_count:
elapsed_time = time.time() - start_time
start_time = time.time()
print("Step: %d," % (step+1),
" Epoch: %2d," % (epoch+1),
" Batch: %3d of %3d," % (i+1, batch_count),
" Cost: %.4f," % cost,
" AvgTime: %3.2fms" % float(elapsed_time*1000/frequency))
count = 0
print("Test-Accuracy: %2.2f" % sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
print("Total Time: %3.2fs" % float(time.time() - begin_time))
print("Final Cost: %.4f" % cost)
sv.stop()
print("done")
I run the above code on my three nodes with the commands below in the terminal:
pc-01$ python example.py --job-name="ps" --task_index=0
pc-02$ python example.py --job-name="worker" --task_index=0
pc-03$ python example.py --job-name="worker" --task_index=1
However, after the variables are initialized, I hit a problem: the terminal of the worker always prints:
I tensorflow/core/distributed_runtime/master.cc:193] CreateSession still waiting for response from worker:/job:worker/replica:0/task:0
and the terminal of the ps doesn't proceed.
The IP of the ps is 192.168.1.102, and the IPs of the workers are 192.168.1.103 and 192.168.1.104, just as in the code above.
Can anyone help me?
I guess filtering out devices should help here. Could you please try adding device_filters to your session config?
config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=False,
device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])
with sv.prepare_or_wait_for_session(server.target, config=config) as sess:
This should fix the issue.