Multi-GPU CIFAR10 example takes more time as the number of GPUs increases. I am using eight Tesla K80 GPUs - tensorflow

I am running the multi-GPU CIFAR10 example. I observed that as I increase the number of GPUs used by the example, the time taken to train increases.
The nvidia-smi -l 1 command shows the expected utilization and behavior of the GPUs, but training takes longer with more GPUs, which is unexpected.
I don't know if I am missing any configuration settings before running the example.
I also tried running MNIST on multiple GPUs and faced a similar problem; basically I was trying to collect some statistics for multi-GPU training.
As I increase the number of GPUs by changing the loop bound in for i in xrange(num_gpus):, more time is taken. Is anything wrong with the code?
# imports and definitions assumed by the snippet (not shown in the original post)
import time
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
TOWER_NAME = 'tower'  # assumed, as in the CIFAR10 multi-GPU tutorial

start_time = time.time()

def train():
    with tf.device('/cpu:0'):
        x = tf.placeholder(tf.float32, [None, 784])
        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))
        #y = tf.nn.softmax(tf.matmul(x, W) + b)
        y_ = tf.placeholder(tf.float32, [None, 10])
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        op = tf.train.GradientDescentOptimizer(0.5)

        tower_grads = []
        for i in xrange(4):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    #batch_xs, batch_ys = mnist.train.next_batch(100)
                    #batch_xs, batch_ys = queue.dequeue_many(100)
                    y = tf.nn.softmax(tf.matmul(x, W) + b)
                    #print(batch_xs)
                    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
                    tower_gradient = op.compute_gradients(cross_entropy)
                    tower_grads.append(tower_gradient)

        grads = average_gradients(tower_grads)  # helper as in the CIFAR10 multi-GPU tutorial
        apply_gradient_op = op.apply_gradients(grads, global_step=global_step)

        sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
        #coord = tf.train.Coordinator()
        #enqueue_threads = qr.create_threads(sess, coord=coord, start=True)
        tf.global_variables_initializer().run()

        for _ in range(1000):
            data_batch, label_batch = mnist.train.next_batch(100)
            #data_batch, label_batch = sess.run([batch_xs, batch_ys])
            sess.run(apply_gradient_op, feed_dict={x: data_batch, y_: label_batch})

        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
        #coord.request_stop()
        #coord.join(enqueue_threads)
        sess.close()

train()
print("--- %s seconds ---" % (time.time() - start_time))
Thanks & Regards

Related

How to view the graphs on tensorboard with eventFiles already present in Folders?

I have the tf.event files present in a folder; I run the command to view them, but I am still not able to see the graph.
Please find the code attached; the part related to the graph is provided.
I am using TensorFlow 1.8. Upgrading had a lot of issues, so I am staying on the lower version.
# Initialize the FileWriter
with tf.Session() as sess:
    writer = tf.summary.FileWriter("./Training_FileWriter/", sess.graph)
    writer1 = tf.summary.FileWriter("./Validation_FileWriter/", sess.graph)

    # Add the cost and accuracy to summary
    tf.summary.scalar('loss', tf.squeeze(cross_entropy))
    tf.summary.scalar('accuracy', tf.squeeze(accuracy))

    # Merge all summaries together
    merged_summary = tf.summary.merge_all()
    #
    #
    # After executing loss, optimizer, accuracy
    summ = sess.run(merged_summary, feed_dict=feed_dict_train)
    writer.add_summary(summ, epoch * int(len(trainLabels) / batch_size) + batch)
Would it help if you had a full-fledged example like the one below? With it I am able to view the graphs.
tensorboard --logdir=D:\Development_Avecto\TensorFlow\logs\1\train
TensorBoard 1.9.0 at http://LT032871:6006 (Press CTRL+C to quit)
import tensorflow as tf

# reset everything to rerun in jupyter
tf.reset_default_graph()

# config
batch_size = 100
learning_rate = 0.5
training_epochs = 5
logs_path = "D:/Development_Avecto/TensorFlow/logs/1/train"

# load mnist data set
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

# input images
with tf.name_scope('input'):
    # None -> batch size can be any size, 784 -> flattened mnist image
    x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
    # target 10 output classes
    y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")

# model parameters will change during training so we use tf.Variable
with tf.name_scope("weights"):
    W = tf.Variable(tf.zeros([784, 10]))

# bias
with tf.name_scope("biases"):
    b = tf.Variable(tf.zeros([10]))

# implement model
with tf.name_scope("softmax"):
    # y is our prediction
    y = tf.nn.softmax(tf.matmul(x, W) + b)

# specify cost function
with tf.name_scope('cross_entropy'):
    # this is our cost
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

# specify optimizer
with tf.name_scope('train'):
    # optimizer is an "operation" which we can execute in a session
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

with tf.name_scope('Accuracy'):
    # Accuracy
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# create a summary for our cost and accuracy
tf.summary.scalar("cost", cross_entropy)
tf.summary.scalar("accuracy", accuracy)

# merge all summaries into a single "operation" which we can execute in a session
summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    # variables need to be initialized before we can use them
    sess.run(tf.initialize_all_variables())

    # create log writer object
    writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

    # perform training cycles
    for epoch in range(training_epochs):
        # number of batches in one epoch
        batch_count = int(mnist.train.num_examples / batch_size)

        for i in range(batch_count):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            # perform the operations we defined earlier on batch
            _, summary = sess.run([train_op, summary_op], feed_dict={x: batch_x, y_: batch_y})
            # write log
            writer.add_summary(summary, epoch * batch_count + i)

        if epoch % 5 == 0:
            print("Epoch: ", epoch)
            print("Accuracy: ", accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))

    print("done")

2 Layer Neural Network Does not Converge

Background
I am a newbie to TensorFlow and I am trying to understand the basics of deep learning. I started by writing a two-layer neural network from scratch, which achieved 89% accuracy on the MNIST dataset, and now I am trying to implement the same network in TensorFlow and compare the two.
Problem
I am not sure whether I am missing something basic in the code, but the following implementation seems unable to update the weights and therefore does not output anything meaningful.
num_hidden = 100
# x -> (batch_size, 784)
x = tf.placeholder(tf.float32, [None, 784])
W1 = tf.Variable(tf.zeros((784, num_hidden)))
b1 = tf.Variable(tf.zeros((1, num_hidden)))
W2 = tf.Variable(tf.zeros((num_hidden, 10)))
b2 = tf.Variable(tf.zeros((1, 10)))
# z -> (batch_size, num_hidden)
z = tf.nn.relu(tf.matmul(x, W1) + b1)
# y -> (batch_size, 10)
y = tf.nn.softmax(tf.matmul(z, W2) + b2)
# y_ -> (batch_size, 10)
y_ = tf.placeholder(tf.float32, [None, 10])
# y_ * tf.log(y) -> (batch_size, 10)
cross_entropy = -tf.reduce_sum(y_ * tf.log(y+1e-10))
train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
# tf.argmax(y, axis=1) returns the maximum index in each row
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
for epoch in range(1000):
    # batch_xs -> (100, 784)
    # batch_ys -> (100, 10), one-hot encoded
    batch_xs, batch_ys = mnist.train.next_batch(100)
    train_data = {x: batch_xs, y_: batch_ys}
    sess.run(train_step, feed_dict=train_data)

print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
W1_e, b1_e, W2_e, b2_e = W1.eval(), b1.eval(), W2.eval(), b2.eval()
sess.close()
What I Have Done
I checked the official docs and many other implementations, but I feel totally confused since they may use different versions and the API varies greatly.
So could someone help me? Thank you in advance.
There are two problems with what you have done so far. First, you have initialised all of the weights to zero, which prevents the network from learning: with everything at zero the hidden activations are zero and the gradients flowing back into the weights are zero as well, so the network stays stuck at chance accuracy. Secondly, the learning rate was too high. The code below got me 0.9665 accuracy. For why you should not set all the weights to zero, you can see here.
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
num_hidden = 100
# x -> (batch_size, 784)
x = tf.placeholder(tf.float32, [None, 784])
label_place = tf.placeholder(tf.float32, [None, 10])
# WONT WORK as EVERYTHING IS ZERO!
# # Get accuracy at chance \approx 0.1
# W1 = tf.Variable(tf.zeros((784, num_hidden)))
# b1 = tf.Variable(tf.zeros((1, num_hidden)))
# W2 = tf.Variable(tf.zeros((num_hidden, 10)))
# b2 = tf.Variable(tf.zeros((1, 10)))
# Will work, you will need to train a bit more than 1000 steps
# though
W1 = tf.Variable(tf.random_normal((784, num_hidden), 0., 0.1))
b1 = tf.Variable(tf.zeros((1, num_hidden)))
W2 = tf.Variable(tf.random_normal((num_hidden, 10), 0, 0.1))
b2 = tf.Variable(tf.zeros((1, 10)))
# network, we only go as far as the linear output after the hidden layer
# so we can feed it into the tf.nn.softmax_cross_entropy_with_logits below
# this is more numerically stable
z = tf.nn.relu(tf.matmul(x, W1) + b1)
logits = tf.matmul(z, W2) + b2
# define our loss etc. as before; note that the learning rate is lower, since
# with a higher learning rate it wasn't really working
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=label_place, logits=logits)
train_step = tf.train.GradientDescentOptimizer(.001).minimize(cross_entropy)
# continue as before
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
correct_prediction = tf.equal(tf.argmax(tf.nn.softmax(logits), 1), tf.argmax(label_place, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
for epoch in range(5000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    train_data = {x: batch_xs, label_place: batch_ys}
    sess.run(train_step, feed_dict=train_data)

print(sess.run(accuracy, feed_dict={x: mnist.test.images, label_place: mnist.test.labels}))
W1_e, b1_e, W2_e, b2_e = W1.eval(), b1.eval(), W2.eval(), b2.eval()
sess.close()

How to switch from GradientDescent Optimizer to Adam in Tensorflow

My code is running perfectly with Gradient Descent, but I want to compare the effectiveness of my algorithm using Adam Optimizer, so I tried to modify the following code:
# Import MNIST data
#import input_data
#mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
#fashion_mnist = input_data.read_data_sets('data/fashion')
import tensorflow as tf

# Set parameters
learning_rate = 0.01 #1e-4
training_iteration = 30
batch_size = 100
display_step = 2

# TF graph input
x = tf.placeholder("float", [None, 784]) # mnist data image of shape 28*28=784
y = tf.placeholder("float", [None, 10]) # 0-9 digits recognition => 10 classes
#regularizer = tf.reduce_sum(tf.square(y))

# Create a model
# Set model weights
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

with tf.name_scope("Wx_b") as scope:
    # Construct a linear model
    model = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax

# Add summary ops to collect data
w_h = tf.summary.histogram("weights", W)
b_h = tf.summary.histogram("biases", b)

# More name scopes will clean up graph representation
with tf.name_scope("cost_function") as scope:
    # Minimize error using cross entropy
    # Cross entropy
    cost_function = -tf.reduce_sum(y * tf.log(model))
    # Create a summary to monitor the cost function
    tf.summary.scalar("cost_function", cost_function)

with tf.name_scope("train") as scope:
    # Gradient descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)

# Initializing the variables
#init = tf.initialize_all_variables()
init = tf.global_variables_initializer()

# Merge all summaries into a single operator
merged_summary_op = tf.summary.merge_all()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    summary_writer = tf.summary.FileWriter('/home/raed/Tensorflow/tensorflow_demo', graph_def=sess.graph_def)
    #writer.add_graph(sess.graph_def)

    # Training cycle
    for iteration in range(training_iteration):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Fit training using batch data
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            # Compute the average loss
            avg_cost += sess.run(cost_function, feed_dict={x: batch_xs, y: batch_ys}) / total_batch
            # Write logs for each iteration
            summary_str = sess.run(merged_summary_op, feed_dict={x: batch_xs, y: batch_ys})
            summary_writer.add_summary(summary_str, iteration * total_batch + i)
        # Display logs per iteration step
        if iteration % display_step == 0:
            print("Iteration:" "%04d" % (iteration + 1), "cost=", "{:.9f}".format(avg_cost))

    print("Tuning completed!")

    # Test the model
    predictions = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(predictions, "float"))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
To use the Adam optimizer, I tried to change the following line:
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)
and replace it with the AdamOptimizer:
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_function)
When I ran the code, it completed a few iterations and then stopped with the following error.
InvalidArgumentError (see above for traceback): Nan in summary histogram for: weights
[[Node: weights = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](weights/tag, Variable/read)]]
Could you please help me understand the problem? Thanks in advance.
The problem is that the weights are initialized to zero with W = tf.Variable(tf.zeros([784, 10])); that is why you get NaN in the weights.
You need to initialize them with some initializer, e.g. a normal distribution, as follows:
W = tf.Variable(tf.random_normal([784, 10], stddev=0.35),
                name="weights")

TensorBoard shows No image data was found

I have implemented a NN for MNIST using TensorFlow and I want to show the results on TensorBoard. Following are screenshots of the TensorBoard pages from my implementation, but the IMAGES page shows "No image data was found".
What information should be shown there? Should I just ignore it?
CODE
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

tf.reset_default_graph()
mnist = input_data.read_data_sets('data', one_hot=True)

batch_size = 100
learning_rate = 0.5
training_epochs = 5
logs_path = "C:/tmp/mlp"

with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
    y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")

with tf.name_scope("weights"):
    W = tf.Variable(tf.zeros([784, 10]))

with tf.name_scope("biases"):
    b = tf.Variable(tf.zeros([10]))

with tf.name_scope("softmax"):
    y = tf.nn.softmax(tf.matmul(x, W) + b)

with tf.name_scope('cross_entropy'):
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

with tf.name_scope('train'):
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

with tf.name_scope('Accuracy'):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

tf.summary.scalar("cost", cross_entropy)
tf.summary.scalar("accuracy", accuracy)
summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    summary_writer = tf.summary.FileWriter("C:/tmp/mlp", sess.graph)

    for epoch in range(training_epochs):
        batch_count = int(mnist.train.num_examples / batch_size)
        for i in range(batch_count):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            _, summary = sess.run([train_op, summary_op], feed_dict={x: batch_x, y_: batch_y})
            summary_writer.add_summary(summary, epoch * batch_count + i)

        if epoch % 5 == 0:
            print("Epoch: ", epoch)
            print("Accuracy: ", accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))

    print("done")
The only lines in your code that refer to a summary operation are:
tf.summary.scalar("cost", cross_entropy)
tf.summary.scalar("accuracy", accuracy)
These lines create 2 scalar summaries (and add the created summary to a default collection that contains every defined summary).
You're not defining any image summary (with tf.summary.image), so that tab in TensorBoard will be empty.
Just ignore it: because you did not save any tf.summary.image summary, TensorBoard won't show anything in this tab.
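If you did want something to appear in the IMAGES tab, one option is to reshape the flattened MNIST input back to 28x28 and add an image summary before merging. A minimal sketch; the summary name and max_outputs value are arbitrary choices:

with tf.name_scope('input_images'):
    # reshape the flat 784-vector back to a batch of 28x28x1 images
    image_input = tf.reshape(x, [-1, 28, 28, 1])
    tf.summary.image('input', image_input, max_outputs=3)

# merge_all must run after this so the image summary is included in summary_op
summary_op = tf.summary.merge_all()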

Array of optimizers

I want to test a TensorFlow classifier with several optimizers. With this code:
optimizers = [
    tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.AdadeltaOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.AdagradOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.FtrlOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.ProximalGradientDescentOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.ProximalAdagradOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.RMSPropOptimizer(learning_rate).minimize(cross_entropy)]
for optimizer in optimizers:
    print(optimizer)
I got this error:
TypeError: __init__() missing 1 required positional argument: 'name'
Any help please.
Following the MNIST tutorial on tensorflow.org and combining it with your array of optimizers, I can obtain accuracy rates for all of them. The error message you are getting seems to come from a different place.
Code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

learning_rate = 0.5
optimizers = [
    tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.AdadeltaOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.AdagradOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.FtrlOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.ProximalGradientDescentOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.ProximalAdagradOptimizer(learning_rate).minimize(cross_entropy),
    tf.train.RMSPropOptimizer(learning_rate).minimize(cross_entropy)]

for optimizer in optimizers:
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
    for _ in range(1000):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(optimizer, feed_dict={x: batch_xs, y_: batch_ys})
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
Output:
0.9157
0.8832
0.9169
0.098
0.917
0.9149
0.917
0.098
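The two 0.098 scores (Adam and RMSProp, the fourth and last entries) are close to chance accuracy, which is what you would expect when the shared learning rate of 0.5 is far too large for those optimizers. A hedged variation is to give each optimizer its own learning rate instead of one shared value; the numbers below are illustrative defaults, not tuned:

optimizers = [
    tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy),
    tf.train.AdadeltaOptimizer(0.5).minimize(cross_entropy),
    tf.train.AdagradOptimizer(0.5).minimize(cross_entropy),
    tf.train.AdamOptimizer(0.001).minimize(cross_entropy),
    tf.train.FtrlOptimizer(0.5).minimize(cross_entropy),
    tf.train.ProximalGradientDescentOptimizer(0.5).minimize(cross_entropy),
    tf.train.ProximalAdagradOptimizer(0.5).minimize(cross_entropy),
    tf.train.RMSPropOptimizer(0.001).minimize(cross_entropy)]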