Fully Convolutional Network, Training Error - tensorflow

I apologize that I'm not good at English.
I'm trying to build my own Fully Convolutional Network using TensorFlow.
But I have difficulties on training this model with my own image data, whereas the MNIST data worked properly.
Here is my FCN model code: (Not using pre-trained or pre-bulit model)
import tensorflow as tf
import numpy as np
Loading MNIST Data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
images_flatten = tf.placeholder(tf.float32, shape=[None, 784])
images = tf.reshape(images_flatten, [-1,28,28,1]) # CNN deals with 3 dimensions
labels = tf.placeholder(tf.float32, shape=[None, 10])
keep_prob = tf.placeholder(tf.float32) # Dropout Ratio
Convolutional Layers
# Conv. Layer #1
W1 = tf.Variable(tf.truncated_normal([3, 3, 1, 4], stddev = 0.1))
b1 = tf.Variable(tf.truncated_normal([4], stddev = 0.1))
FMA = tf.nn.conv2d(images, W1, strides=[1,1,1,1], padding='SAME')
# FMA stands for Fused Multiply Add, which means convolution
RELU = tf.nn.relu(tf.add(FMA, b1))
POOL = tf.nn.max_pool(RELU, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
# Conv. Layer #2
W2 = tf.Variable(tf.truncated_normal([3, 3, 4, 8], stddev = 0.1))
b2 = tf.Variable(tf.truncated_normal([8], stddev = 0.1))
FMA = tf.nn.conv2d(POOL, W2, strides=[1,1,1,1], padding='SAME')
RELU = tf.nn.relu(tf.add(FMA, b2))
POOL = tf.nn.max_pool(RELU, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
# Conv. Layer #3
W3 = tf.Variable(tf.truncated_normal([7, 7, 8, 16], stddev = 0.1))
b3 = tf.Variable(tf.truncated_normal([16], stddev = 0.1))
FMA = tf.nn.conv2d(POOL, W3, strides=[1,1,1,1], padding='VALID')
RELU = tf.nn.relu(tf.add(FMA, b3))
# Dropout
Dropout = tf.nn.dropout(RELU, keep_prob)
# Conv. Layer #4
W4 = tf.Variable(tf.truncated_normal([1, 1, 16, 10], stddev = 0.1))
b4 = tf.Variable(tf.truncated_normal([10], stddev = 0.1))
FMA = tf.nn.conv2d(Dropout, W4, strides=[1,1,1,1], padding='SAME')
LAST_RELU = tf.nn.relu(tf.add(FMA, b4))
Summary: [Conv-ReLU-Pool] - [Conv-ReLU-Pool] - [Conv-ReLU] - [Dropout] - [Conv-ReLU]
Define Loss, Accuracy
prediction = tf.squeeze(LAST_RELU)
# Because FCN returns (1 x 1 x class_num) in training
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(prediction, labels))
# First arg is 'logits=' and the other one is 'labels='
optimizer = tf.train.AdamOptimizer(0.001)
train = optimizer.minimize(loss)
label_max = tf.argmax(labels, 1)
pred_max = tf.argmax(prediction, 1)
correct_pred = tf.equal(pred_max, label_max)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
Training Model
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(10000):
image_batch, label_batch = mnist.train.next_batch(100)
sess.run(train, feed_dict={images: image_batch, labels: label_batch, keep_prob: 0.8})
if i % 10 == 0:
tr = sess.run([loss, accuracy], feed_dict={images: image_batch, labels: label_batch, keep_prob: 1.0})
print("Step %d, Loss %g, Accuracy %g" % (i, tr[0], tr[1]))
Loss: 0.784 (Approximately)
Accuracy: 94.8% (Approximately)
The problem is that, training this model with MNIST data worked very well, but with my own data, loss is always same(0.6319), and the output layer is always 0.
There is no difference with the code, excepting for the third convolutional layer's filter size. This filter size and input size which is compressed by previous pooling layers, must have same width & height. That's why the filter size in this layer is [7,7].
What is wrong with my model?..
The only different code between two cases (MNIST, my own data) is:
Placeholder
My own data has (128 x 64 x 1) and the label is 'eyes', 'not_eyes'
images = tf.placeholder(tf.float32, [None, 128, 64, 1])
labels = tf.placeholder(tf.int32, [None, 2])
3rd Convolutional Layer
W3 = tf.Variable(tf.truncated_normal([32, 16, 8, 16], stddev = 0.1))
Feeding (Batch)
image_data, label_data = input_data.get_batch(TRAINING_FILE, 10)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for i in range(10000):
image_batch, label_batch = sess.run([image_data, label_data])
sess.run(train, feed_dict={images: image_batch, labels: label_batch, keep_prob: 0.8})
if i % 10 == 0: ... # Validation part is almost same, too...
coord.request_stop()
coord.join(threads)
Here "input_data" is an another python file in the same directory, and "get_batch(TRAINING_FILE, 10)" is the function that returns batch data. The code is:
def get_input_queue(txtfile_name):
images = []
labels = []
for line in open(txtfile_name, 'r'): # Here txt file has data's path, label, label number
cols = re.split(',|\n', line)
labels.append(int(cols[2]))
images.append(tf.image.decode_jpeg(tf.read_file(cols[0]), channels = 1))
input_queue = tf.train.slice_input_producer([images, labels], shuffle = True)
return input_queue
def get_batch(txtfile_name, batch_size):
input_queue = get_input_queue(txtfile_name)
image = input_queue[0]
label = input_queue[1]
image = tf.reshape(image, [128, 64, 1])
batch_image, batch_label = tf.train.batch([image, label], batch_size)
batch_label_one_hot = tf.one_hot(tf.to_int64(batch_label), 2, on_value=1.0, off_value=0.0)
return batch_image, batch_label_one_hot
It seems not to have any problem .... :( Please Help me..!!

Are your inputs scaled appropriately?. The jpegs are in [0-255] range and it needs to be scaled to [-1 - 1]. You can try:
image = tf.reshape(image, [128, 64, 1])
image = tf.scalar_mul((1.0/255), image)
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)

What is the accuracy you are getting with your model for MNIST? It would be helpful if you post the code. Are you using the trained model to evaluate the output for your own data.
A general suggestion on setting up the convolution model is provided here.
Here is the model suggestion according to the article :-
INPUT -> [[CONV -> RELU]*N -> POOL?]*M -> [FC -> RELU]*K -> FC
Having more than one layers of CONV->RELU pair before pooling improves learning complex features. Try with N=2 instead of 1.
Some other suggestions:
While you are preparing your data reduce it to smaller size than 128x64. Try same size as the MNIST data ..
image = tf.reshape(image, [28, 28, 1])
If your eye/noeye image is color, then convert it to greyscale and normalize the values to unity range. You can do this using numpy or tf, here is how using numpy
grayscale-->
img = np.dot(np.array(img, dtype='float32'), [[0.2989],[0.5870],[0.1140]])
normalize-->
mean = np.mean(img, dtype='float32')
std = np.std(img, dtype='float32', ddof=1)
if std < 1e-4: std = 1.
img = (img - mean) / std

Related

Roi pooling and backpropagation

I have implemented ROI pooling at my graph. The code is as follows.
def __init__(self,fatness,image_shape, vocab, r_vocab, num_classes,rnn_cells_num):
CTCUtils.vocab = vocab
CTCUtils.r_vocab = r_vocab
self.global_step = tf.Variable(0, name='global_step', trainable=False)
self.input_labels=tf.placeholder(dtype=tf.string, shape=(config.train.input_labels_size,))
self.input_dat = tf.placeholder(dtype=tf.float32, shape=(None,config.train.extracted_feature_height,config.train.extracted_feature_width,512))
self.in_boxes = tf.placeholder(dtype=tf.float32, shape=(config.train.input_labels_size,5))
self.num_classes = num_classes
self.rnn_cells_num = rnn_cells_num
self.saver = tf.train.Saver()
self.poolheight=1
self.poolwidth=32
self.sess = tf.Session(graph = tf.get_default_graph())
with slim.arg_scope([slim.conv2d, slim.max_pool2d]):
########################################################
#########CONV layers before ROI pooling#################
########################################################
net = slim.repeat(self.input_dat, 4, slim.conv2d, fatness, [3, 3], padding='SAME',scope='conv6',weights_regularizer=slim.l2_regularizer(config.weight_decay),weights_initializer=tf.contrib.layers.xavier_initializer(),biases_initializer = tf.zeros_initializer(),activation_fn=tf.nn.relu)
self.in_boxes=tf.dtypes.cast(self.in_boxes,tf.int32)
########################################################
#######ROI pooling layer################################
########################################################
rec_fmap_clone = roi_pooling(net, self.in_boxes, pool_height=self.poolheight, pool_width=self.poolwidth) #shape is (1, 20, 256, 1, 32)
decision=(rec_fmap_clone.get_shape()==None)
if (decision==False):
self.rec_fmap = tf.identity(rec_fmap_clone)
shape=np.shape(self.rec_fmap)
self.rec_fmap=np.reshape(self.rec_fmap, (shape[1],shape[2],shape[3],shape[4]))
self.rec_fmap=tf.transpose(self.rec_fmap, perm=[0, 2, 3, 1])
else:
self.rec_fmap=tf.ones([config.train.input_labels_size, 1, 32, 256], tf.float32)
with slim.arg_scope([slim.conv2d],normalizer_fn=slim.batch_norm,weights_initializer=tf.truncated_normal_initializer(stddev=0.01),weights_regularizer=slim.l2_regularizer(0.0005)):
classes = slim.conv2d(self.rec_fmap, self.num_classes, [1, 13])
pattern = slim.fully_connected(slim.flatten(classes), self.rnn_cells_num) # patterns number
width = int(self.rec_fmap.get_shape()[2])
pattern = tf.reshape(pattern, (-1, 1, 1, self.rnn_cells_num))
pattern = tf.tile(pattern, [1, 1, width, 1])
inf = tf.concat(axis=3, values=[classes, pattern]) # skip connection over RNN
inf = slim.conv2d(inf, self.num_classes, [1, 1], normalizer_fn=None,activation_fn=None) # fully convolutional linear activation
inf = tf.squeeze(inf, [1])
prob = tf.transpose(inf, (1, 0, 2)) # prepare for CTC
data_length = tf.fill([tf.shape(prob)[1]], tf.shape(prob)[0]) # input seq length, batch size
ctc = tf.py_func(CTCUtils.compute_ctc_from_labels, [self.input_labels], [tf.int64, tf.int64, tf.int64])
ctc_labels = tf.to_int32(tf.SparseTensor(ctc[0], ctc[1], ctc[2]))
predictions = tf.to_int32(tf.nn.ctc_beam_search_decoder(prob, data_length, merge_repeated=False, beam_width=10)[0][0])
tf.sparse_tensor_to_dense(predictions, default_value=-1, name='d_predictions')
tf.reduce_mean(tf.edit_distance(predictions, ctc_labels, normalize=False), name='error_rate')
self.loss = tf.reduce_mean(tf.compat.v1.nn.ctc_loss(inputs=prob, labels=ctc_labels, sequence_length=data_length, ctc_merge_repeated=True), name='loss')
self.learning_rate = tf.train.piecewise_constant(self.global_step, [150000, 200000],[config.train.learning_rate, 0.1 * config.train.learning_rate,0.01 * config.train.learning_rate])
self.opt_loss = tf.contrib.layers.optimize_loss(self.loss, self.global_step, self.learning_rate, config.train.opt_type, config.train.grad_noise_scale, name='train_step')
self.sess.run(tf.global_variables_initializer())
The graph has a few convolution layers before ROI pooling and ctc loss is used for optimization.
The concern is whether convolution layers before ROI pooling are optimized in back propagation.
According to discussion here, ROI pooling layer itself is differentiable.
But when the graph is plotted in tensorboard, the graph is disconnected after ROI pooling layer.
How can I check and make sure the conv layers before ROI pooling are update in optimization?
The issue was solved by putting conv layers after RoiPooling.
The first graph was used only for feature extraction using RoiPooling. RoiPooling output size was set bigger dimensions. Then those outputs were used as inputs to the second graph. There conv layers were placed. So that I have weights to optimize.
The modified graph is shown below.

how to get an array of predictions from tensor flow classification model

I have the following classification model.
I would like to get a numpy array similar to y_t which is the test labels one hot encoded. However I keep getting variable error.
# Construct placeholders
with graph.as_default():
inputs_ = tf.placeholder(tf.float32, [None, seq_len, n_channels], name = 'inputs')
labels_ = tf.placeholder(tf.float32, [None, n_classes], name = 'labels')
keep_prob_ = tf.placeholder(tf.float32, name = 'keep')
learning_rate_ = tf.placeholder(tf.float32, name = 'learning_rate')
with graph.as_default():
# (batch, 100, 3) --> (batch, 50, 6)
conv1 = tf.layers.conv1d(inputs=inputs_, filters=6, kernel_size=2, strides=1,
padding='same', activation = tf.nn.relu)
max_pool_1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=2, strides=2, padding='same')
with graph.as_default():
# Flatten and add dropout
flat = tf.reshape(max_pool_1, (-1, 6*6))
flat = tf.nn.dropout(flat, keep_prob=keep_prob_)
# Predictions
logits = tf.layers.dense(flat, n_classes)
# Cost function and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels_))
optimizer = tf.train.AdamOptimizer(learning_rate_).minimize(cost)
# Accuracy
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(labels_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name='accuracy')
Then I use the test set
with tf.Session(graph=graph) as sess:
# Restore
saver.restore(sess, tf.train.latest_checkpoint('bschkpnt-cnn'))
for x_t, y_t in get_batches(X_test, y_test, batch_size):
feed = {inputs_: x_t,
labels_: y_t,
keep_prob_: 1}
batch_acc = sess.run(accuracy, feed_dict=feed)
test_acc.append(batch_acc)
print("Test accuracy: {:.6f}".format(np.mean(test_acc)))
y_t is a nX3 bumpy array.
I want to get a y_pred in similar format
Thanks
soft = tf.nn.softmax(logits)
this will be your probability distribution such that sum(soft) = 1. Every value in this array will indicate how sure the model is about the class.
pred = sess.run(soft, feed_dict=feed)
print(pred)
So basically all I do is place an additional softmax, since you have it inbuilt in the loss you calculate, you've to place it again to predict. Then I ask for the output prediction, and just feed the feed_dict again.
Hope this helped!

ValueError: Cannot feed value of shape (128, 28, 28) for Tensor 'Placeholder:0', which has shape '(?, 784)'

I am new to Tensorflow and Machine Learning and trying out CNN using Tensorflow with my custom input data. But I am getting the error attached below.
The Data or Image Size is 28x28 with 15 Labels.
I am not getting the numpy reshape thing in this script or the error.
Help is highly appreciated.
import tensorflow as tf
import os
import skimage.data
import numpy as np
import random
def load_data(data_directory):
directories = [d for d in os.listdir(data_directory)
if os.path.isdir(os.path.join(data_directory, d))]
labels = []
images = []
for d in directories:
label_directory = os.path.join(data_directory, d)
file_names = [os.path.join(label_directory, f)
for f in os.listdir(label_directory)
if f.endswith(".jpg")]
for f in file_names:
images.append(skimage.data.imread(f))
labels.append(d)
print(str(d)+' Completed')
return images, labels
ROOT_PATH = "H:\Testing\TrainingData"
train_data_directory = os.path.join(ROOT_PATH, "Training")
test_data_directory = os.path.join(ROOT_PATH, "Testing")
print('Loading Data...')
images, labels = load_data(train_data_directory)
print('Data has been Loaded')
n_classes = 15
training_examples = 10500
test_examples = 4500
batch_size = 128
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float')
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')
def maxpool2d(x):
return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
def neural_network_model(x):
weights = {'W_Conv1':tf.Variable(tf.random_normal([5,5,1,32])),
'W_Conv2':tf.Variable(tf.random_normal([5,5,32,64])),
'W_FC':tf.Variable(tf.random_normal([7*7*64, 1024])),
'Output':tf.Variable(tf.random_normal([1024, n_classes]))}
biases = {'B_Conv1':tf.Variable(tf.random_normal([32])),
'B_Conv2':tf.Variable(tf.random_normal([64])),
'B_FC':tf.Variable(tf.random_normal([1024])),
'Output':tf.Variable(tf.random_normal([n_classes]))}
x = tf.reshape(x, shape=[-1,28,28,1])
conv1 = conv2d(x, weights['W_Conv1'])
conv1 = maxpool2d(conv1)
conv2 = conv2d(conv1, weights['W_Conv2'])
conv2 = maxpool2d(conv2)
fc = tf.reshape(conv2, [-1, 7*7*64])
fc = tf.nn.relu(tf.matmul(fc, weights['W_FC'])+biases['B_FC'])
output = tf.matmul(fc, weights['Output'])+biases['Output']
return output
def next_batch(num, data, labels):
idx = np.arange(0 , len(data))
np.random.shuffle(idx)
idx = idx[:num]
data_shuffle = [data[ i] for i in idx]
labels_shuffle = [labels[ i] for i in idx]
return np.asarray(data_shuffle), np.asarray(labels_shuffle)
def train_neural_network(x):
prediction = neural_network_model(x)
cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
optimizer = tf.train.AdamOptimizer().minimize(cost)
hm_epochs = 10
with tf.Session() as sess:
# OLD:
#sess.run(tf.initialize_all_variables())
# NEW:
sess.run(tf.global_variables_initializer())
for epoch in range(hm_epochs):
epoch_loss = 0
for _ in range(int(training_examples/batch_size)):
epoch_x, epoch_y = next_batch(batch_size, images, labels)
_, c = sess.run([optimizer, cost], feed_dict={x: epoch_x, y: epoch_y})
epoch_loss += c
print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
print('Accuracy:',accuracy.eval({x: images, y: labels}))
print('Training Neural Network...')
train_neural_network(x)
What am I doing wrong? What is needed to be fixed and how do I fix the shape of numpy array?
If you look closely, you'll see that you have two x placeholders:
x = tf.placeholder('float', [None, 784]) # global
...
x = tf.reshape(x, shape=[-1,28,28,1]) # in neural_network_model
One of them is in the function scope, hence not visible in train_neural_network, so tensorflow takes the one with [?, 784] shape. You should get rid of one of them.
Also note that your training data has the rank 3, i.e. [batch_size, 28, 28], so it's not directly compatible with any of those placeholders.
To feed it into the first x, take epoch_x.reshape([-1, 784]). For the second placeholder (once you make it visible), take epoch_x.reshape([-1, 28, 28, 1]).

How does the reuse option in tf.variable_scope work?

I have a following problem, I am writing a simple code to learn how tensorflow works and I am defining the variables for convolution with help of tf.variable_scope. However everytime I try to run this script I get a ValueError saying either to set reuse=None or reuse=True.
Can somebody explain why doesn't it just run the function without defining this option or what would be a solution for that?
My code is:
import re
import tensorflow as tf
import numpy as np
data = np.load('/home/joanna/tensorflow-master/tensorflow/models/image/cifar10/konsensop/data.npy')
labels = np.load('/home/joanna/tensorflow-master/tensorflow/models/image/cifar10/konsensop/labels.npy')
labels = np.zeros((16400,))
labels[10001:16400]=1
labels = labels.astype(int)
data = data.astype(np.float32)
#labels = tf.cast(labels,tf.int64)
MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1 # Initial learning rate.
NUM_CLASSES=2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN= 1000
batch_size=300
def _variable_on_cpu(name, shape, initializer):
dtype = tf.float32
var = tf.get_variable(name, shape, initializer = initializer, dtype = dtype)
return var
def _add_loss_summaries(loss):
"""Add summaries for losses in CIFAR-10 model.
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name +' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def _variable_with_weight_decay(name, shape, stddev, wd):
dtype = tf.float32
var = _variable_on_cpu(
name,
shape,
tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
return var
def _activation_summary(x):
tensor_name = re.sub('_[0-9]*/','', x.op.name)
tf.histogram_summary(tensor_name + '/activations', x)
tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
def iterate_batches(data, labels, batch_size, num_epochs):
N = int(labels.shape[0])
batches_per_epoch = int(N/batch_size)
for i in range(num_epochs):
for j in range(batches_per_epoch):
start, stop = j*batch_size, (j+1)*batch_size
yield data[start:stop,:,:,:], labels[start:stop]
def train():
with tf.Graph().as_default():
global_step = tf.Variable(0)
x_tensor = tf.placeholder(tf.float32, shape=(batch_size, 3000,1,1))
y_tensor = tf.placeholder(tf.int64, shape=(batch_size,))
for x,y in iterate_batches(data,labels, 300,1):
print('yey!')
with tf.variable_scope('conv1',reuse=True) as scope:
kernel = _variable_with_weight_decay('weights',
shape=[100,1,1,64],
stddev=5e-2,
wd=0.0)
conv = tf.nn.conv2d(x_tensor, kernel, [1,3,1,1], padding = 'SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
bias = tf.nn.bias_add(conv, biases)
conv1 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv1)
pool1 = tf.nn.max_pool(conv1, ksize=[1,20,1,1], strides=[1,2,1,1], padding='SAME', name='pool1')
norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm1')
with tf.variable_scope('conv2',reuse=True) as scope:
kernel = _variable_with_weight_decay('weights', [50,1,64,64], stddev = 5e-2, wd=0.0)
conv = tf.nn.conv2d(norm1, kernel, [1,3,1,1], padding='SAME')
biases = _variable_on_cpu('biases',[64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv,biases)
conv2 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv2)
norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001/9.0, beta = 0.75, name='norm2')
pool2 = tf.nn.max_pool(norm2, ksize=[1,10,1,1], strides=[1,2,1,1], padding='SAME', name='pool2')
with tf.variable_scope('conv3',reuse=True) as scope:
kernel = _variable_with_weight_decay('weights', [30,1,64,64], stddev = 5e-2, wd=0.0)
conv = tf.nn.conv2d(pool2, kernel, [1,10,1,1], padding='SAME')
biases = _variable_on_cpu('biases',[64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv,biases)
conv3 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv3)
norm3 = tf.nn.lrn(conv3, 4, bias=1.0, alpha=0.001/9.0, beta = 0.75, name='norm3')
pool3 = tf.nn.max_pool(norm3, ksize=[1,9,1,1], strides=[1,9,1,1], padding='SAME', name='pool3')
with tf.variable_scope('fc4',reuse=True) as scope:
# Move everything into depth so we can perform a single matrix multiply.
reshape = tf.reshape(pool3, [batch_size, -1])
dim = reshape.get_shape()[1].value
weights = _variable_with_weight_decay('weights', shape=[dim, 64], stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
fc4 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
_activation_summary(fc4)
with tf.variable_scope('fc5',reuse=True) as scope:
weights = _variable_with_weight_decay('weights', shape=[64, 64],
stddev=0.04, wd=0.004)
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
fc5 = tf.nn.relu(tf.matmul(fc4, weights) + biases, name=scope.name)
_activation_summary(fc5)
with tf.variable_scope('softmax_linear',) as scope:
weights = _variable_with_weight_decay('weights', [64, NUM_CLASSES],
stddev=1/64.0, wd=0.0)
biases = _variable_on_cpu('biases', [NUM_CLASSES],
tf.constant_initializer(0.0))
softmax_linear = tf.add(tf.matmul(fc5, weights), biases, name=scope.name)
_activation_summary(softmax_linear)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(softmax_linear, y_tensor, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
kupa = tf.add_to_collection('losses', cross_entropy_mean)
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
#neu
num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /batch_size
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, global_step, decay_steps, LEARNING_RATE_DECAY_FACTOR, staircase=True)
loss_averages_op = _add_loss_summaries(loss)
summary_op = tf.merge_all_summaries()
#neu
init = tf.initialize_all_variables()
sess = tf.Session(config = tf.ConfigProto(log_device_placement=False))
sess.run(init)
sess.run([conv, bias, conv1, pool1, norm1, conv2,norm2, pool2, conv3, norm3, pool3,fc4,fc5], feed_dict={x_tensor:x, y_tensor:y})
sess.run([softmax_linear,loss], feed_dict={x_tensor:x, y_tensor:y})
sess.run([lr, loss_averages_op, summary_op], feed_dict={x_tensor:x, y_tensor:y})
The problem is with this line here:
for x,y in iterate_batches(data,labels, 300,1):
This will recreate the graph however many times which is a bad thing to do as it'll take up more memory each time (this isn't always the case but it can happen).
The reuse=True comes in something like this example below when defining the graph.
# First call creates one set of variables.
result1 = my_image_filter(image1)
# Another set of variables is created in the second call.
result2 = my_image_filter(image2)
Tensorflow doesn't know whether or not you want to "reuse" the variables as in should they share the same parameters or not.
In your specific case by looping your recreating the parameters each time and telling tensorflow to simply reuse the variables.
It would be better if you could move the for loop to after the graph creation has already occurred and then you could get rid of the reuse=True everywhere.

Tensorflow training got stuck after some steps, how to investigate?

I have a python script to train a Tensorflow model similar to the one in CIFAR-10 tutorial. I have 20500 training examples and am using 128 examples per batch. I set 1,000,000 as the max number of steps. However after about 164,000 steps, the python script seems stuck somewhere. Is there any way to find out where the script is stuck? My last resort would be using Ctrl-C to terminate the process and force it to print out a backtrace. But I wonder if there are other things I should check before I kill the process.
Here's the train loop:
def train(trainingData, batchSize, workingDir, maxSteps):
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable=False)
image, label = readData(trainingData)
minAfterDequeue = 5000
capacity = minAfterDequeue + 3 * batchSize
imageBatch, labelBatch = tf.train.shuffle_batch([image, label], batch_size=batchSize, capacity=capacity, min_after_dequeue=minAfterDequeue)
#labelBatch = tf.reshape(labelBatch, [batchSize, 1])
#tf.image_summary('images', imageBatch)
#tf.histogram_summary('labels', tf.cast(labelBatch, tf.float32))
logits = network.inference(imageBatch, 0.5)
#floatLabel = tf.cast(labelBatch, tf.float32)
#cross_entropy_per_example = tf.nn.softmax_cross_entropy_with_logits(logits, floatLabel)
loss, cross_entropy = network.loss(logits, labelBatch)
train_op = network.train(loss, global_step, batchSize)
# Create a saver
saver = tf.train.Saver(tf.all_variables())
summary_op = tf.merge_all_summaries()
session = tf.Session()
init = tf.initialize_all_variables()
session.run(init)
tf.train.start_queue_runners(sess=session)
summary_writer = tf.train.SummaryWriter(workingDir, session.graph_def)
for step in xrange(maxSteps):
start_time = time.time()
#l, sm, ce = session.run([floatLabel, logits, cross_entropy_per_example])
#print l
#print sm
#print ce
_, loss_value = session.run([train_op, loss])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
examples_per_sec = batchSize / duration
format_str = "%s: step %d, loss = %e (%.1f examples/sec; %.3f sec/batch"
print (format_str % (datetime.now(), step, loss_value, examples_per_sec, float(duration)))
if step % 100 == 0:
summary_str = session.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 1000 == 0 or (step + 1) == maxSteps:
checkpoint_path = os.path.join(workingDir, 'model.ckpt')
saver.save(session, checkpoint_path, global_step = step)
And here's the various functions used to construct the graph:
import re
import tensorflow as tf
TOWER_NAME="tower"
NUM_EXAMPLES_PER_EPOCH = 50000
# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.95 # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.01 # Initial learning rate.
def _activation_summary(x):
"""Helper to create summaries for activations.
Creates a summary that provides a histogram of activations.
Creates a summary that measure the sparsity of activations.
Args:
x: Tensor
Returns:
nothing
"""
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
tf.histogram_summary(tensor_name + '/activations', x)
tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
#numChannel = tf.shape(x)[3]
#tf.image_summary(tensor_name + '/image', tf.reshape(x)
def _variable_on_cpu(name, shape, initializer):
"""Helper to create a Variable stored on CPU memory.
Args:
name: name of the variable
shape: list of ints
initializer: initializer for Variable
Returns:
Variable Tensor
"""
with tf.device('/cpu:0'):
var = tf.get_variable(name, shape, initializer=initializer, dtype=tf.float32)
return var
def _variable_with_weight_decay(name, shape, stddev, wd=None):
"""Helper to create an initialized Variable with weight decay.
Note that the Variable is initialized with a truncated normal distribution.
A weight decay is added only if one is specified.
Args:
name: name of the variable
shape: list of ints
stddev: standard deviation of a truncated Gaussian
wd: add L2Loss weight decay multiplied by this float. If None, weight
decay is not added for this Variable.
Returns:
Variable Tensor
"""
var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev))
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(var), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
return var
def inference(images, dropout):
# conv1
with tf.variable_scope('conv1') as scope:
kernel = _variable_with_weight_decay('weights', shape=[5, 5, 1, 32], stddev=5e-2)
conv = tf.nn.conv2d(images, kernel, [1,1,1,1], padding='SAME')
biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv1 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv1)
# pool1
pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1')
# conv2
with tf.variable_scope('conv2') as scope:
kernel = _variable_with_weight_decay('weights', shape=[3, 3, 32, 64], stddev=5e-2)
conv = tf.nn.conv2d(pool1, kernel, [1,1,1,1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv2 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv2)
# pool2
pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2')
# conv3
with tf.variable_scope('conv3') as scope:
kernel = _variable_with_weight_decay('weights', shape=[3, 3, 64, 64], stddev=5e-2)
conv = tf.nn.conv2d(pool2, kernel, [1,1,1,1], padding='SAME')
biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
bias = tf.nn.bias_add(conv, biases)
conv3 = tf.nn.relu(bias, name=scope.name)
_activation_summary(conv3)
# pool 3
pool3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool3')
# fully connected 4
with tf.variable_scope('full4') as scope:
batchSize = pool3.get_shape()[0].value
flattened = tf.reshape(pool3, [batchSize, -1])
dim = flattened.get_shape()[1].value
weights = _variable_with_weight_decay('weights', shape=[dim, 256], stddev=5e-2)
biases = _variable_on_cpu('biases', [256], tf.constant_initializer(0.1))
full4 = tf.nn.relu(tf.matmul(flattened, weights) + biases, name=scope.name)
full4_dropout = tf.nn.dropout(full4, dropout)
_activation_summary(full4)
#_activation_summary(full4_dropout)
# fully connected 5
with tf.variable_scope('full5') as scope:
weights = _variable_with_weight_decay('weights', [256, 128], stddev=5e-2)
biases = _variable_on_cpu('biases', [128], tf.constant_initializer(0.1))
full5 = tf.nn.relu(tf.matmul(full4_dropout, weights) + biases, name=scope.name)
full5_dropout = tf.nn.dropout(full5, dropout)
_activation_summary(full5)
#_activation_summary(full5_dropout)
# softmax
with tf.variable_scope('softmax_linear') as scope:
weights = _variable_with_weight_decay('weights', [128, 2], stddev=1/128.0)
biases = _variable_on_cpu('biases', [2], tf.constant_initializer(0.0))
softmax_linear = tf.add(tf.matmul(full5_dropout, weights), biases, name=scope.name)
_activation_summary(softmax_linear)
return softmax_linear
def loss(logits, labels):
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss'), cross_entropy_mean
def _add_loss_summaries(total_loss):
"""Add summaries for losses in CIFAR-10 model.
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name +' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def train(loss, step, batchSize):
numBatchesPerEpoch = NUM_EXAMPLES_PER_EPOCH / batchSize
decay_steps = int(numBatchesPerEpoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
tf.scalar_summary('learning_rate', lr)
loss_averages_op = _add_loss_summaries(loss)
# compute gradients
with tf.control_dependencies([loss_averages_op]):
opt = tf.train.GradientDescentOptimizer(lr)
grads = opt.compute_gradients(loss)
# apply gradients
apply_gradient_op = opt.apply_gradients(grads, global_step = step)
# add histograms for trainable variables
for var in tf.trainable_variables():
tf.histogram_summary(var.op.name, var)
# add histograms for gradients:
for grad, var in grads:
if grad is not None:
tf.histogram_summary(var.op.name + '/gradients', grad)
# Track the moving average of all trainable variables
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, step)
variable_averages_op = variable_averages.apply(tf.trainable_variables())
with tf.control_dependencies([apply_gradient_op, variable_averages_op]):
train_op = tf.no_op(name='train')
return train_op