Output of loss is None

Output of loss is None - tensorflow

I have to finetune VGG.There are five convolutional layers and then three fully connected layers. Output from the last fully connected layer is the input of the loss function. Following is my code:
class vgg16:
def __init__(self, imgs1,imgs2, weights=None, sess=None):
self.imgs1 = imgs1
self.imgs2 = imgs2
with tf.variable_scope("siamese") as scope:
self.o1 = self.convlayers(imgs1)
self.fc_layers()
self.loss()
if weights is not None and sess is not None:
self.load_weights(weights, sess)
scope.reuse_variables()
self.o2 = self.convlayers(imgs2)
self.fc_layers()
self.loss()
if weights is not None and sess is not None:
self.load_weights(weights, sess)
#create loss function
def convlayers(self,imgs):
....
# conv1_2
with tf.name_scope('conv1_2') as scope:
......
# pool1
..
)
.....
# pool5
self.pool5 = tf.nn.max_pool(self.conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
def fc_layers(self):
# fc1
with tf.name_scope('fc1') as scope:
....
# fc2
with tf.name_scope('fc2') as scope:
...
# fc3
with tf.name_scope('fc3') as scope:
fc3w = tf.Variable(tf.truncated_normal([4096, 1000],
dtype=tf.float32,
stddev=1e-1), name='weights')
fc3b = tf.Variable(tf.constant(1.0, shape=[1000], dtype=tf.float32),
trainable=True, name='biases')
self.fc3l = tf.nn.bias_add(tf.matmul(self.fc2, fc3w), fc3b)
def load_weights(self, weight_file, sess):
weights = np.load(weight_file)
keys = sorted(weights.keys())
for i, k in enumerate(keys):
print i, k, np.shape(weights[k])
sess.run(self.parameters[i].assign(weights[k]))
def loss(self):
loss=tf.nn.l2_loss(self.fc3l)
self.train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
if __name__ == '__main__':
sess = tf.Session()
imgs1 = tf.placeholder(tf.float32, [None, 224, 224, 3])#jis size ka bhi imaeg hai usko 224x224 may kar diya or RGB chaeay hmay
imgs2 = tf.placeholder(tf.float32, [None, 224, 224, 3])
vgg = vgg16(imgs1,imgs2, 'vgg16_weights.npz', sess)
img1 = imread('laska.png', mode='RGB')
img1 = imresize(img1, (224, 224))
img2 = imread('laska2.jpg', mode='RGB')
img2 = imresize(img2,(224, 224))
prob = sess.run(vgg.train_step, feed_dict={vgg.imgs1: [img1],vgg.imgs2: [img2]})
print('loss is:')
print(prob)
The problem is that the output of prob is None. Kindly indicate what I am doing wrong.
PS: I am following siamese architecture. Input to both branches are different images here.

The op self.train_step does not return anything, it just calculates gradients and updates variables. See here.
What you need to do is to save reference to loss tensor in your vgg16 class like this:
self.loss=tf.nn.l2_loss(self.fc3l)
and then execute both train_step and loss operations in single sess.run:
_, loss_value = sess.run([vgg.train_step, vgg.loss], feed_dict=...)
print('loss is:')
print(loss_value)

Related

tensor flow error: logits and labels must be broadcastable

I am having the following error displayed while trying to get tensorflow running:
InvalidArgumentError: logits and labels must be broadcastable: logits_size=[30,2] labels_size=[8,2]
Below is my code. I obtained parts of the 1st part of the code from https://blog.francium.tech/build-your-own-image-classifier-with-tensorflow-and-keras-dc147a15e38e and the second from https://www.datacamp.com/community/tutorials/cnn-tensorflow-python. I adopted them to something I am working on where I have some images that belong to 2 different classes. For training, each image class are placed in the same training folder and for testing, each image class is placed in the same testing folder. I figure the error is referring to a mismatch between the logits and label. I have tried tweaking the shapes in the weights and biases as defined in the code below, but this didn't solve the issue. I also tried tampering with the batch size, still no solution. Does anyone have any idea what could cause this error? Could it be how I arranged my training and testing set?
ROOT_PATH = "/my/file/path/images"
train_data_directory = os.path.join(ROOT_PATH, "data/train")
test_data_directory = os.path.join(ROOT_PATH, "data/test")
train_data = train_data_directory
test_data = test_data_directory
def one_hot_label(img):
label = img.split('.')[0]
global ohl
ohl = []
if label == 'A':
ohl = np.array([1,0])
elif label == 'B':
ohl = np.array([0,1])
return ohl
def train_data_with_label():
train_images = []
for i in tqdm(os.listdir(train_data)):
path = os.path.join(train_data,i)
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (28,28))
train_images.append([np.array(img), one_hot_label(i)])
shuffle(train_images)
return train_images
def test_data_with_label():
test_images = []
for i in tqdm(os.listdir(test_data)):
path = os.path.join(test_data,i)
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (28,28))
test_images.append([np.array(img), one_hot_label(i)])
shuffle(test_images)
return test_images
training_images = train_data_with_label()
testing_images = test_data_with_label()
#both placeholders are of type float
x = tf.placeholder("float", [None, 28,28,1])
y = tf.placeholder("float", [None, n_classes])
def conv2d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool2d(x, k=2):
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],padding='SAME')
weights = {
'wc1': tf.get_variable('W0', shape=(3,3,1,32), initializer=tf.contrib.layers.xavier_initializer()),
'wc2': tf.get_variable('W1', shape=(3,3,32,64), initializer=tf.contrib.layers.xavier_initializer()),
'wc3': tf.get_variable('W2', shape=(3,3,64,128), initializer=tf.contrib.layers.xavier_initializer()),
'wd1': tf.get_variable('W3', shape=(4*4*128,128), initializer=tf.contrib.layers.xavier_initializer()),
'out': tf.get_variable('W6', shape=(128,n_classes), initializer=tf.contrib.layers.xavier_initializer()),
}
biases = {
'bc1': tf.get_variable('B0', shape=(32), initializer=tf.contrib.layers.xavier_initializer()),
'bc2': tf.get_variable('B1', shape=(64), initializer=tf.contrib.layers.xavier_initializer()),
'bc3': tf.get_variable('B2', shape=(128), initializer=tf.contrib.layers.xavier_initializer()),
'bd1': tf.get_variable('B3', shape=(128), initializer=tf.contrib.layers.xavier_initializer()),
'out': tf.get_variable('B4', shape=(2), initializer=tf.contrib.layers.xavier_initializer()),
}
def conv_net(x, weights, biases):
# here we call the conv2d function we had defined above and pass the input image x, weights wc1 and bias bc1.
conv1 = conv2d(x, weights['wc1'], biases['bc1'])
# Max Pooling (down-sampling), this chooses the max value from a 2*2 matrix window and outputs a 14*14 matrix.
conv1 = maxpool2d(conv1, k=2)
# Convolution Layer
# here we call the conv2d function we had defined above and pass the input image x, weights wc2 and bias bc2.
conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
# Max Pooling (down-sampling), this chooses the max value from a 2*2 matrix window and outputs a 7*7 matrix.
conv2 = maxpool2d(conv2, k=2)
conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
# Max Pooling (down-sampling), this chooses the max value from a 2*2 matrix window and outputs a 4*4.
conv3 = maxpool2d(conv3, k=2)
#print(conv3.shape)
# Fully connected layer
# Reshape conv2 output to fit fully connected layer input
fc1 = tf.reshape(conv3, [-1, weights['wd1'].get_shape().as_list()[0]])
fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
fc1 = tf.nn.relu(fc1)
# Output, class prediction
# finally we multiply the fully connected layer with the weights and add a bias term.
out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
print(out.shape)
return out
#print(out.shape)
pred = conv_net(x, weights, biases)
#pred.shape
#labelsa = tf.constant(1., shape=y.shape)
#logsa = tf.constant(1., shape=pred.shape)
#labels = labels + tf.zeros_like(logsa)
print(pred)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y))
print(y)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
with tf.Session() as sess:
sess.run(init)
train_loss = []
test_loss = []
train_accuracy = []
test_accuracy = []
summary_writer = tf.summary.FileWriter('./Output', sess.graph)
for i in range(training_iters):
#print('here')
for batch in range(len(train_X)//batch_size):
print('here')
#offset = (batch * batch_size) % (train_Y.shape[0] - batch_size)
batch_x = train_X[batch*batch_size:min((batch+1)*batch_size,len(train_X))]
batch_y = train_Y[batch*batch_size:min((batch+1)*batch_size,len(train_Y))]
# Run optimization op (backprop).
# Calculate batch loss and accuracy
print(batch_y.shape)
opt = sess.run(optimizer, feed_dict={x: batch_x,
y: batch_y})
loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
y: batch_y})
print("Iter " + str(i) + ", Loss= " + \
"{:.6f}".format(loss) + ", Training Accuracy= " + \
"{:.5f}".format(acc))
print("Optimization Finished!")
# Calculate accuracy for all 10000 mnist test images
test_acc,valid_loss = sess.run([accuracy,cost], feed_dict={x: test_X,y: test_Y})
train_loss.append(loss)
test_loss.append(valid_loss)
train_accuracy.append(acc)
test_accuracy.append(test_acc)
print("Testing Accuracy:","{:.5f}".format(test_acc))
summary_writer.close()

Weights in the network are not being updated during backpropogation

Here is my feature extracting network.
class vgg16:
def __init__(self, imgs1,imgs2, weights=None, sess=None):
self.imgs1 = imgs1
self.imgs2 = imgs2
with tf.variable_scope("siamese") as scope:
self.o1 = self.convlayers(self.imgs1)
.....
if weights is not None and sess is not None:
self.load_weights(weights, sess)
scope.reuse_variables()
self.o2 = self.convlayers(self.imgs2)
.......
if weights is not None and sess is not None:
self.load_weights(weights, sess)
#create loss function
with tf.variable_scope("loss") as scope:
self.cd=self.cos_dist(self.o1,self.o2,sess)
self.loss(self.cd,self.o1,self.o2)
def convlayers(self,_image):
.....
def load_weights(self, weight_file, sess):
....
def loss(self,la,i1,j2):
label=tf.cast(la, tf.float32)
i=tf.nn.l2_normalize(i1, 1, epsilon=1e-2, name='normed')
j=tf.nn.l2_normalize(j2, 1, epsilon=1e-2, name='normed')
sub= tf.subtract(i,j)
magi=tf.sqrt(tf.reduce_sum(tf.square(sub)))
ab=tf.abs(magi)
sq=tf.square(ab)
self.left=sq*label
#working on right now
second_part=1-label
tpart=tf.constant(0.675, dtype=tf.float32, name='T')-magi
tp=tf.maximum(0.0,tpart)
self.right=second_part*tp
self.final_loss=self.left+self.right
o= tf.train.GradientDescentOptimizer(0.09)
self.l=o.compute_gradients(self.final_loss)
self.o=o.apply_gradients(self.l)
def cos_dist(self,net_1,net_2,sess):
......
if __name__ == '__main__':
# for batch processing
sess = tf.InteractiveSession()
imgs1 = tf.placeholder(tf.float32, [None, 224, 224, 3])
imgs2 = tf.placeholder(tf.float32, [None, 224, 224, 3])
vgg = vgg16(imgs1,imgs2, 'vgg16_weights.npz', sess)
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(vgg.final_loss)
#try:
img1 = imread('c2.jpeg', mode='RGB')
img1 = imresize(img1,(224, 224))
img2 = imread('c7.jpeg', mode='RGB')
img2 = imresize(img2,(224, 224))
M= sess.run([vgg.k,vgg.o,vgg.final_loss],feed_dict={vgg.imgs1: [img1],vgg.imgs2: [img2]})
It is a siames fashioned CNN. The problem is that the latter layers do not get updated. And the first ones are updated to Nan. Is there something wrong with sess.run()? The way I am calling the attributes.
Usually one of the reasons for this is faulty input. And I tried printing the values img1 and img2 and the values contained in the matrices is 255 only

Optimize input image with class prior

I'm trying to implement the first part of the google blog entry
Inceptionism: Going Deeper into Neural Networks in TensorFlow. So far I have found several resources that either explain it in natural language or focus on other parts or give code snippets for other frameworks. I understand the idea of optimizing a random input image with respect to a class prior and also the maths behind it given in the this paper, section 2, but I'm not able to implement it myself using TensorFlow.
From this SO question and the helpful comment by etarion, I now know that you can give a list of variables to the optimizer, while all other variables are untouched. However, when giving the optimizer a random image as a variable leads to
File "mnist_test.py", line 101, in main
optimizer2 = tf.train.AdamOptimizer(learning_rate).minimize(-cost, var_list=[rnd_img])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 198, in minimize
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/optimizer.py", line 309, in apply_gradients
(converted_grads_and_vars,))
ValueError: No gradients provided for any variable: ((None,<tensorflow.python.ops.variables.Variable object at 0x7feac1870410>),)
For testing purpose I used a stripped down MNIST example. I tried to keep it as short as possible while still being readable and executable:
def main():
# parameters
learning_rate = 0.001
train_batches = 1000
batch_size = 128
display_step = 50
# net parameters
n_input = 784 #28x28
n_classes = 10
keep_prob = 0.75
weights = {
'wc1': tf.Variable(tf.truncated_normal([5, 5, 1, 32])),
'wc2': tf.Variable(tf.truncated_normal([5, 5, 32, 64])),
'wd1': tf.Variable(tf.truncated_normal([7*7*64, 1024])),
'out': tf.Variable(tf.truncated_normal([1024, n_classes]))
}
biases = {
'bc1': tf.Variable(tf.constant(0.1, shape=[32])),
'bc2': tf.Variable(tf.constant(0.1, shape=[64])),
'bd1': tf.Variable(tf.constant(0.1, shape=[1024])),
'out': tf.Variable(tf.constant(0.1, shape=[n_classes]))
}
# tf inputs
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
dropout = tf.placeholder(tf.float32)
# create net
net = create_net(x, weights, biases, keep_prob)
# define loss
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(net, y))
# define optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# evaluation
pred_correct = tf.equal(tf.argmax(net, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(pred_correct, tf.float32))
print "loading mnist data"
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in xrange(train_batches):
batch_x, batch_y = mnist.train.next_batch(batch_size)
sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, dropout: keep_prob})
if i % display_step == 0:
loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x, y: batch_y, dropout: 1.0})
print "batch: %i, loss: %.5f, accuracy: %.5f" % (i, loss, acc)
acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, dropout: 1.0})
print "test accuracy: %.5f" % (acc)
# ====== this is where the reconstruction begins =====
rnd_img = tf.Variable(tf.random_normal([1, n_input]))
one_hot = np.zeros(10)
one_hot[4] = 1;
# the next line causes the error
optimizer2 = tf.train.AdamOptimizer(learning_rate).minimize(-cost, var_list=[rnd_img])
for i in xrange(1000):
session.run(optimizer2, feed_dict={x: rnd_img, y: one_hot, dropout: 1.0})
sess.close()
if __name__ == "__main__":
main()
The helper functions I used:
def create_net(x, weights, biases, dropout):
x = tf.reshape(x, shape=[-1, 28, 28, 1])
conv1 = conv2d_relu(x, weights['wc1'], biases['bc1'])
conv1 = maxpool2d(conv1, 2)
conv2 = conv2d_relu(conv1, weights['wc2'], biases['bc2'])
conv2 = maxpool2d(conv2, 2)
fc1 = fullyconnected_relu(conv2, weights['wd1'], biases['bd1'])
fc1 = tf.nn.dropout(fc1, dropout)
out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
return out
def conv2d_relu(x, W, b, stride=1):
conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME')
conv = tf.nn.bias_add(conv, b)
return tf.nn.relu(conv)
def maxpool2d(x, k=2, stride=2, padding='VALID'):
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, stride, stride, 1], padding=padding)
def fullyconnected_relu(x, W, b):
fc = tf.reshape(x, [-1, W.get_shape().as_list()[0]])
fc = tf.add(tf.matmul(fc, W), b)
fc = tf.nn.relu(fc)
I've found some sources saying that this error occurs when there is no path within the computation graph between the output and the variables to be optimize, but I don't see why this should be the case here.
My questions are:
Why isn't the optimizer able to apply any gradients?
Is this the right way to go in order to implement the visualization of a class?
Thanks in advance.
Edit:
Here is the complete code again, after incorporation of the accepted answer (for anyone who is interested). Anyway, the results are still not as expected, as the script basically produces random images after 100000 rounds of reconstruction. Ideas are welcome.
import tensorflow as tf
import numpy as np
import skimage.io
def conv2d_relu(x, W, b, stride=1):
conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME')
conv = tf.nn.bias_add(conv, b)
return tf.nn.relu(conv)
def maxpool2d(x, k=2, stride=2, padding='VALID'):
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, stride, stride, 1], padding=padding)
def fullyconnected_relu(x, W, b):
fc = tf.reshape(x, [-1, W.get_shape().as_list()[0]])
fc = tf.add(tf.matmul(fc, W), b)
fc = tf.nn.relu(fc)
return fc;
def create_net(x, weights, biases, dropout):
x = tf.reshape(x, shape=[-1, 28, 28, 1])
conv1 = conv2d_relu(x, weights['wc1'], biases['bc1'])
conv1 = maxpool2d(conv1, 2)
conv2 = conv2d_relu(conv1, weights['wc2'], biases['bc2'])
conv2 = maxpool2d(conv2, 2)
fc1 = fullyconnected_relu(conv2, weights['wd1'], biases['bd1'])
fc1 = tf.nn.dropout(fc1, dropout)
out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
return out
def save_image(img_data, name):
img = img_data.reshape(28,28)
mi = np.min(img)
ma = np.max(img)
img = (img-mi)/(ma-mi)
skimage.io.imsave(name, img)
def main():
# parameters
learning_rate = 0.001
train_batches = 1000
batch_size = 100
display_step = 50
# net parameters
n_input = 784 #28x28
n_classes = 10
keep_prob = 0.75
weights = {
'wc1': tf.Variable(tf.truncated_normal([5, 5, 1, 32])),
'wc2': tf.Variable(tf.truncated_normal([5, 5, 32, 64])),
'wd1': tf.Variable(tf.truncated_normal([7*7*64, 1024])),
'out': tf.Variable(tf.truncated_normal([1024, n_classes]))
}
biases = {
'bc1': tf.Variable(tf.constant(0.1, shape=[32])),
'bc2': tf.Variable(tf.constant(0.1, shape=[64])),
'bd1': tf.Variable(tf.constant(0.1, shape=[1024])),
'out': tf.Variable(tf.constant(0.1, shape=[n_classes]))
}
# tf inputs
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
dropout = tf.placeholder(tf.float32)
# create net
net = create_net(x, weights, biases, dropout)
# define loss
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(net, y))
# define optimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# evaluation
pred_correct = tf.equal(tf.argmax(net, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(pred_correct, tf.float32))
print "loading mnist data"
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in xrange(train_batches):
batch_x, batch_y = mnist.train.next_batch(batch_size)
sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, dropout: keep_prob})
if i % display_step == 0:
loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x, y: batch_y, dropout: 1.0})
print "batch: %i, loss: %.5f, accuracy: %.5f" % (i, loss, acc)
acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, dropout: 1.0})
print "test accuracy: %.5f" % (acc)
# reconstruction part
rnd_img = tf.Variable(tf.random_normal([1, n_input]))
one_hot = np.zeros((1, 10))
one_hot[0,1] = 1;
net2 = create_net(rnd_img, weights, biases, dropout)
cost2 = -tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(net2, y))
optimizer2 = tf.train.AdamOptimizer(learning_rate).minimize(cost2, var_list=[rnd_img])
init_var_list = []
for var in tf.all_variables():
if(not tf.is_variable_initialized(var).eval(session=sess)):
init_var_list.append(var)
sess.run(tf.initialize_variables(init_var_list))
save_image(rnd_img.eval(sess), "bevor.tiff")
for i in xrange(100000):
_, loss = sess.run([optimizer2, cost2], feed_dict={y: one_hot, dropout: 1.0})
if(i%10000 == 0):
cur_img = rnd_img.eval(session=sess)
print "loss:", loss, "mi:", np.min(cur_img), "ma:", np.max(cur_img)
save_image(rnd_img.eval(sess), "after.tiff")
sess.close()
if __name__ == "__main__":
main()
Some explanation: After rebuilding the graph with the new input variable and optimizer, I had to initialize the new variables, i.e. the rnd_img and some helper variables used by the Adam optimizer, hence the loop over all_variables() and checking for initialization status. If somebody knows a more elegant way, let me know. Or maybe that's the reason why I don't get any results?

The rnd_img needs to part of the graph that you optimize. In your case, you just create a variable and tell the optimizer to optimize it, but the variable is not connected to the loss in the graph. What you can for example do is use another call to create_net with rnd_image instead of x (but using the same weights!), create the cost for that and then create a minimization op for that cost. Then for optimization you only feed in y.

Tensorflow network too large for GPU memory

I have a rather large network, and my GPU is running out of memory. It isn't a bug in any of the code, the network itself is simply too large to fit into memory. I've even tried the GPU config recommendations here.
For example, I've tried both of the gpu_options below...
gpu_options = tf.GPUOptions()
config = tf.ConfigProto(gpu_options=gpu_options)
config.gpu_options.allow_growth = True
# config.optimizer_options.opt_level = 2
# config.graph_options.enable_recv_scheduling = True
# config.graph_options.build_cost_model = 1
config.gpu_options.per_process_gpu_memory_fraction = 0.1
But I'm still running out of memory. I have been informed by GitHub user #girving here that Tensorflow doesn't handle Memory overflow (which makes no sense to me why they wouldn't implement this).
However, he also claimed there are workarounds. I can't find any support of anyone who has had to implement a workaround. Can anyone point me in the right direction? Can I implement Queuing somehow?
For reference, here is some code... the program runs out of memory at the time of sess(init)
#Kendall Weihe
#This is a CNN that handles 3D data
#Adjust network parameters below, also adjust data directory
import tensorflow as tf
import pdb
import numpy as np
from numpy import genfromtxt
from PIL import Image
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.grid_rnn.python.ops import grid_rnn_cell
from tensorflow.tensorflow.scroll import scroll_data
# Parameters
learning_rate = 0.001
training_iters = 1000000
batch_size = 1
display_step = 1
# Network Parameters
n_images = 100
n_input_x = 396 # Input image x-dimension
n_input_y = 396 # Input image y-dimension
n_input_z = 5
n_hidden = 128
n_classes = 2 # Binary classification -- on a surface or not
n_output = n_input_x * n_classes
dropout = 0.75 # Dropout, probability to keep units
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input_z, n_input_x, n_input_y])
y = tf.placeholder(tf.float32, [None, n_input_z, n_input_x, n_input_y, n_classes], name="ground_truth")
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
def input_data():
data = np.empty((n_images, n_input_x, n_input_y))
temp = []
for i in range(n_images):
filename = "/home/volcart/Documents/Data/input_crops/cropped00" + str(i) + ".tif"
im = Image.open(path)
imarray = np.array(im)
temp.append(imarray)
for i in range(n_images):
for j in range(n_input_x):
for k in range(n_input_y):
data[i][j][k] = temp[i][j][k]
return data
# Create some wrappers for simplicity
def conv3d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv3d(x, W, strides=[1, strides, strides, strides, 1], padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool3d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool3d(x, ksize=[1, k, k, k, 1], strides=[1, k, k, k, 1],
padding='SAME')
def deconv3d(prev_layer, w, b, output_shape, strides):
# Deconv layer
deconv = tf.nn.conv3d_transpose(prev_layer, w, output_shape=output_shape, strides=strides, padding="VALID")
deconv = tf.nn.bias_add(deconv, b)
deconv = tf.nn.relu(deconv)
return deconv
# Create model
def conv_net(x, weights, biases, dropout):
# Reshape input picture
x = tf.reshape(x, shape=[-1, n_input_z, n_input_x, n_input_y, 1])
with tf.name_scope("conv1") as scope:
# Convolution Layer
conv1 = conv3d(x, weights['wc1'], biases['bc1'])
# Max Pooling (down-sampling)
#conv1 = tf.nn.local_response_normalization(conv1)
conv1 = maxpool3d(conv1, k=2)
# Convolution Layer
with tf.name_scope("conv2") as scope:
conv2 = conv3d(conv1, weights['wc2'], biases['bc2'])
# Max Pooling (down-sampling)
# conv2 = tf.nn.local_response_normalization(conv2)
conv2 = maxpool3d(conv2, k=2)
# Convolution Layer
with tf.name_scope("conv3") as scope:
conv3 = conv3d(conv2, weights['wc3'], biases['bc3'])
# Max Pooling (down-sampling)
# conv3 = tf.nn.local_response_normalization(conv3)
conv3 = maxpool3d(conv3, k=2)
# pdb.set_trace()
temp_batch_size = tf.shape(x)[0] #batch_size shape
with tf.name_scope("deconv1") as scope:
output_shape = [temp_batch_size, 2, n_input_x / 4, n_input_y / 4, 16]
strides = [1,2,2,2,1]
#conv4 = deconv3d(conv3, weights['wdc1'], biases['bdc1'], output_shape, strides)
# conv4 = tf.nn.local_response_normalization(conv4)
conv4 = tf.nn.conv3d_transpose(conv3, weights['wdc1'], output_shape=output_shape, strides=strides, padding="SAME")
conv4 = tf.nn.bias_add(conv4, biases['bdc1'])
conv4 = tf.nn.relu(conv4)
with tf.name_scope("deconv2") as scope:
output_shape = [temp_batch_size, 3, n_input_x / 2, n_input_y / 2, 8]
strides = [1,1,2,2,1]
conv5 = deconv3d(conv4, weights['wdc2'], biases['bdc2'], output_shape, strides)
# conv5 = tf.nn.local_response_normalization(conv5)
with tf.name_scope("deconv3") as scope:
output_shape = [temp_batch_size, n_input_z, n_input_x, n_input_y, 1]
#this time don't use ReLu -- since output layer
conv6 = tf.nn.conv3d_transpose(conv5, weights['wdc3'], output_shape=output_shape, strides=[1,1,2,2,1], padding="VALID")
conv6 = tf.nn.bias_add(conv6, biases['bdc3'])
conv6 = tf.nn.dropout(conv6, dropout)
# conv6 = tf.nn.relu(conv6)
# pdb.set_trace()
x = tf.reshape(conv6, [-1, n_input_x])
x = tf.split(0, n_input_y * n_input_z, x)
lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True, activation=tf.nn.relu)
# lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * n_hidden, state_is_tuple=True)
lstm_cell = rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=0.75)
outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
output = []
for i in xrange(n_input_y * n_input_z):
output.append(tf.matmul(outputs[i], lstm_weights[i]) + lstm_biases[i])
return output
weights = {
# 5x5 conv, 1 input, 32 outputs
'wc1' : tf.Variable(tf.random_normal([2, 2, 2, 1, 8])),
# 5x5 conv, 32 inputs, 64 outputs
'wc2' : tf.Variable(tf.random_normal([2, 2, 2, 8, 16])),
# 5x5 conv, 32 inputs, 64 outputs
'wc3' : tf.Variable(tf.random_normal([2, 2, 2, 16, 32])),
'wdc1' : tf.Variable(tf.random_normal([2, 2, 2, 16, 32])),
'wdc2' : tf.Variable(tf.random_normal([2, 2, 2, 8, 16])),
'wdc3' : tf.Variable(tf.random_normal([3, 2, 2, 1, 8])),
}
biases = {
'bc1': tf.Variable(tf.random_normal([8])),
'bc2': tf.Variable(tf.random_normal([16])),
'bc3': tf.Variable(tf.random_normal([32])),
'bdc1': tf.Variable(tf.random_normal([16])),
'bdc2': tf.Variable(tf.random_normal([8])),
'bdc3': tf.Variable(tf.random_normal([1])),
}
lstm_weights = {}
lstm_biases = {}
for i in xrange(n_input_y * n_input_z):
lstm_weights[i] = tf.Variable(tf.random_normal([n_hidden, n_output]))
lstm_biases[i] = tf.Variable(tf.random_normal([n_output]))
# Construct model
with tf.name_scope("net") as scope:
print "Building network..."
pred = conv_net(x, weights, biases, keep_prob)
print "Network built!"
# pdb.set_trace()
pred = tf.transpose(tf.pack(pred),[1,0,2])
pred = tf.reshape(pred, [-1, n_input_z, n_input_x, n_input_y, n_classes])
# Reshape for cost function
temp_pred = tf.reshape(pred, [-1, n_classes])
temp_y = tf.reshape(y, [-1, n_classes])
with tf.name_scope("loss") as scope:
# cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(pred, y))
cost = (tf.nn.sigmoid_cross_entropy_with_logits(temp_pred, temp_y))
with tf.name_scope("opt") as scope:
print "Initializing optimizer..."
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
print "optimizer initialized!"
# pdb.set_trace()
# Evaluate model
with tf.name_scope("acc") as scope:
# accuracy is the difference between prediction and ground truth matrices
correct_pred = tf.equal(0,tf.cast(tf.sub(tf.nn.sigmoid(temp_pred),temp_y), tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
with tf.name_scope("prediction-node") as scope:
prediction_node = tf.nn.sigmoid(temp_pred)
# Initializing the variables
with tf.name_scope("initialize-and-config") as scope:
print "Initializing variables & configuring..."
init = tf.initialize_all_variables()
saver = tf.train.Saver()
gpu_options = tf.GPUOptions()
config = tf.ConfigProto(gpu_options=gpu_options)
config.gpu_options.allow_growth = True
# config.optimizer_options.opt_level = 2
# config.graph_options.enable_recv_scheduling = True
# config.graph_options.build_cost_model = 1
config.gpu_options.per_process_gpu_memory_fraction = 0.1
print "Variables and configurations initialized!"
# Launch the graph
with tf.Session(config=config) as sess:
print "Initializing session..."
sess.run(init)
print "Session initialized!"
print "Restoring session..."
saver.restore(sess, "/home/volcart/Documents/3D-CNN-2D-LSTM-reg-model/model.ckpt")
print "Session restored!"
tf.get_default_graph().finalize()
# Import data
print "Importing data..."
data = input_data()
print "Data imported!"
# Keep training until reach max iterations
for i in range(n_images):
print "Prediction image number -- " + str(i)
temp = []
for j in range(n_input_z):
temp.append(data[j,:,:])
temp = np.asarray(temp)
temp = temp.reshape((1, n_input_z, n_input_x, n_input_y))
prediction = sess.run(prediction_node, feed_dict={x: temp, keep_prob: 1.0})
prediction = prediction.reshape((n_input_x, n_input_y, n_classes))
temp_arr1 = np.empty((n_input_x, n_input_y))
for i in xrange(n_input_x):
for j in xrange(n_input_y):
if l == 0:
temp_arr1[i][j] = prediction[i][j][0]
csv_file = "/home/volcart/Documents/3D-CNN-2D-LSTM-pred/3D-CNN-2D-LSTM-step-" + str(i) + ".csv"
np.savetxt(csv_file, temp_arr1, delimiter=",")

Tensorflow training got stuck after some steps, how to investigate?

I have a python script to train a Tensorflow model similar to the one in CIFAR-10 tutorial. I have 20500 training examples and am using 128 examples per batch. I set 1,000,000 as the max number of steps. However after about 164,000 steps, the python script seems stuck somewhere. Is there any way to find out where the script is stuck? My last resort would be using Ctrl-C to terminate the process and force it to print out a backtrace. But I wonder if there are other things I should check before I kill the process.
Here's the train loop:
def train(trainingData, batchSize, workingDir, maxSteps):
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable=False)
image, label = readData(trainingData)
minAfterDequeue = 5000
capacity = minAfterDequeue + 3 * batchSize
imageBatch, labelBatch = tf.train.shuffle_batch([image, label], batch_size=batchSize, capacity=capacity, min_after_dequeue=minAfterDequeue)
#labelBatch = tf.reshape(labelBatch, [batchSize, 1])
#tf.image_summary('images', imageBatch)
#tf.histogram_summary('labels', tf.cast(labelBatch, tf.float32))
logits = network.inference(imageBatch, 0.5)
#floatLabel = tf.cast(labelBatch, tf.float32)
#cross_entropy_per_example = tf.nn.softmax_cross_entropy_with_logits(logits, floatLabel)
loss, cross_entropy = network.loss(logits, labelBatch)
train_op = network.train(loss, global_step, batchSize)
# Create a saver
saver = tf.train.Saver(tf.all_variables())
summary_op = tf.merge_all_summaries()
session = tf.Session()
init = tf.initialize_all_variables()
session.run(init)
tf.train.start_queue_runners(sess=session)
summary_writer = tf.train.SummaryWriter(workingDir, session.graph_def)
for step in xrange(maxSteps):
start_time = time.time()
#l, sm, ce = session.run([floatLabel, logits, cross_entropy_per_example])
#print l
#print sm
#print ce
_, loss_value = session.run([train_op, loss])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
examples_per_sec = batchSize / duration
format_str = "%s: step %d, loss = %e (%.1f examples/sec; %.3f sec/batch"
print (format_str % (datetime.now(), step, loss_value, examples_per_sec, float(duration)))
if step % 100 == 0:
summary_str = session.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 1000 == 0 or (step + 1) == maxSteps:
checkpoint_path = os.path.join(workingDir, 'model.ckpt')
saver.save(session, checkpoint_path, global_step = step)
And here's the various functions used to construct the graph:
import re
import tensorflow as tf
TOWER_NAME="tower"
NUM_EXAMPLES_PER_EPOCH = 50000
# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.95 # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.01 # Initial learning rate.
def _activation_summary(x):
"""Helper to create summaries for activations.
Creates a summary that provides a histogram of activations.
Creates a summary that measure the sparsity of activations.
Args:
x: Tensor
Returns:
nothing
"""
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
tf.histogram_summary(tensor_name + '/activations', x)
tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
#numChannel = tf.shape(x)[3]
#tf.image_summary(tensor_name + '/image', tf.reshape(x)
def _variable_on_cpu(name, shape, initializer):
"""Helper to create a Variable stored on CPU memory.
Args:
name: name of the variable
shape: list of ints
initializer: initializer for Variable
Returns:
Variable Tensor
"""
with tf.device('/cpu:0'):
var = tf.get_variable(name, shape, initializer=initializer, dtype=tf.float32)
return var
def _variable_with_weight_decay(name, shape, stddev, wd=None):
"""Helper to create an initialized Variable with weight decay.
Note that the Variable is initialized with a truncated normal distribution.
A weight decay is added only if one is specified.
Args:
name: name of the variable
shape: list of ints
stddev: standard deviation of a truncated Gaussian
wd: add L2Loss weight decay multiplied by this float. If None, weight
decay is not added for this Variable.
Returns:
Variable Tensor
"""
var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev))
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
return var
def inference(images, dropout):
# conv1
with tf.variable_scope('conv1') as scope:
kernel = _variable_with_weight_decay('weights', shape=[5, 5, 1, 32], stddev=5e-2)
conv = tf.nn.conv2d(images, kernel, [1,1,1,1], padding='SAME')
biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv1 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv1)
# pool1
pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1')
# conv2
with tf.variable_scope('conv2') as scope:
kernel = _variable_with_weight_decay('weights', shape=[3, 3, 32, 64], stddev=5e-2)
conv = tf.nn.conv2d(pool1, kernel, [1,1,1,1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv2 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv2)
# pool2
pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2')
# conv3
with tf.variable_scope('conv3') as scope:
kernel = _variable_with_weight_decay('weights', shape=[3, 3, 64, 64], stddev=5e-2)
conv = tf.nn.conv2d(pool2, kernel, [1,1,1,1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv3 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv3)
# pool 3
pool3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool3')
# fully connected 4
with tf.variable_scope('full4') as scope:
batchSize = pool3.get_shape()[0].value
flattened = tf.reshape(pool3, [batchSize, -1])
dim = flattened.get_shape()[1].value
weights = _variable_with_weight_decay('weights', shape=[dim, 256], stddev=5e-2)
biases = _variable_on_cpu('biases', [256], tf.constant_initializer(0.1))
full4 = tf.nn.relu(tf.matmul(flattened, weights) + biases, name=scope.name)
full4_dropout = tf.nn.dropout(full4, dropout)
_activation_summary(full4)
#_activation_summary(full4_dropout)
# fully connected 5
with tf.variable_scope('full5') as scope:
weights = _variable_with_weight_decay('weights', [256, 128], stddev=5e-2)
biases = _variable_on_cpu('biases', [128], tf.constant_initializer(0.1))
full5 = tf.nn.relu(tf.matmul(full4_dropout, weights) + biases, name=scope.name)
full5_dropout = tf.nn.dropout(full5, dropout)
_activation_summary(full5)
#_activation_summary(full5_dropout)
# softmax
with tf.variable_scope('softmax_linear') as scope:
weights = _variable_with_weight_decay('weights', [128, 2], stddev=1/128.0)
biases = _variable_on_cpu('biases', [2], tf.constant_initializer(0.0))
softmax_linear = tf.add(tf.matmul(full5_dropout, weights), biases, name=scope.name)
_activation_summary(softmax_linear)
return softmax_linear
def loss(logits, labels):
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss'), cross_entropy_mean
def _add_loss_summaries(total_loss):
"""Add summaries for losses in CIFAR-10 model.
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name +' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def train(loss, step, batchSize):
numBatchesPerEpoch = NUM_EXAMPLES_PER_EPOCH / batchSize
decay_steps = int(numBatchesPerEpoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
tf.scalar_summary('learning_rate', lr)
loss_averages_op = _add_loss_summaries(loss)
# compute gradients
with tf.control_dependencies([loss_averages_op]):
opt = tf.train.GradientDescentOptimizer(lr)
grads = opt.compute_gradients(loss)
# apply gradients
apply_gradient_op = opt.apply_gradients(grads, global_step = step)
# add histograms for trainable variables
for var in tf.trainable_variables():
tf.histogram_summary(var.op.name, var)
# add histograms for gradients:
for grad, var in grads:
if grad is not None:
tf.histogram_summary(var.op.name + '/gradients', grad)
# Track the moving average of all trainable variables
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, step)
variable_averages_op = variable_averages.apply(tf.trainable_variables())
with tf.control_dependencies([apply_gradient_op, variable_averages_op]):
train_op = tf.no_op(name='train')
return train_op

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Output of loss is None - tensorflow

Related

tensor flow error: logits and labels must be broadcastable

Weights in the network are not being updated during backpropogation

Optimize input image with class prior

Tensorflow network too large for GPU memory

Tensorflow training got stuck after some steps, how to investigate?

Categories

Resources