I am trying to train on a dataset of 10,000+ images using TensorFlow GPU (GTX 1060 Max-Q, 6 GB). Because the images in my dataset are large (512 x 424 pixels), I get a MemoryError.
Traceback (most recent call last):
File "train.py", line 33, in <module>
data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
File "/home/nabeel/tf-realsense-gesture/dataset.py", line 103, in read_train_sets
images, labels, img_names, cls = shuffle(images, labels, img_names, cls)
File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 403, in shuffle
return resample(*arrays, **options)
File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 327, in resample
resampled_arrays = [safe_indexing(a, indices) for a in arrays]
File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 216, in safe_indexing
return X.take(indices, axis=0)
MemoryError
The problem with my code is that I am training all seven classes at the same time, which is why I get a memory error. I want to process a single class at a time.
I have tried to implement a while/for loop, but every time a loop iteration finishes, the .meta file is overwritten and the model only works on one class. Is there any way to train multiple classes at a time, or one by one?
train.py
batch_size = 1
# 7 classes for recognition
#classes = ['up']
classes = ['up','down','left','right','forward','backward','none']
#classes = ['up','down','left','right','forward','backward','none']
num_classes = len(classes)
# 20% of the data will automatically be used for validation
validation_size = 0.2
img_size = 200
num_channels = 3
train_path='training_data'
# load all the training and validation images and labels into memory
data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
print("Complete reading input data. Will Now print a snippet of it")
print("Number of files in Training-set:\t\t{}".format(len(data.train.labels)))
print("Number of files in Validation-set:\t{}".format(len(data.valid.labels)))
session = tf.Session()
x = tf.placeholder(tf.float32, shape=[batch_size,img_size,img_size,num_channels], name='x')
# labels
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, dimension=1)
#Network graph params
filter_size_conv1 = 3
num_filters_conv1 = 32
filter_size_conv2 = 3
num_filters_conv2 = 32
filter_size_conv3 = 3
num_filters_conv3 = 64
filter_size_conv4 = 3
num_filters_conv4 = 128
filter_size_conv5 = 3
num_filters_conv5 = 256
filter_size_conv6 = 3
num_filters_conv6 = 512
filter_size_conv7 = 3
num_filters_conv7 = 1024
fc_layer_size = 2048
def create_weights(shape):
return tf.Variable(tf.truncated_normal(shape, stddev=0.05))
def create_biases(size):
return tf.Variable(tf.constant(0.05, shape=[size]))
def create_convolutional_layer(input,num_input_channels,conv_filter_size,num_filters):
# define the weights that will be trained
weights = create_weights(shape=[conv_filter_size, conv_filter_size, num_input_channels, num_filters])
# create biases
biases = create_biases(num_filters)
# Create the convolutional layer
layer = tf.nn.conv2d(input=input,filter=weights,strides=[1, 1, 1, 1],padding='SAME')
layer += biases
# max-pooling
layer = tf.nn.max_pool(value=layer,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME')
# Relu is the activation function
layer = tf.nn.relu(layer)
return layer
def create_flatten_layer(layer):
layer_shape = layer.get_shape()
num_features = layer_shape[1:4].num_elements()
# Flatten the layer so reshape to num_features
layer = tf.reshape(layer, [-1, num_features])
return layer
def create_fc_layer(input,
num_inputs,
num_outputs,
use_relu=True):
# define trainable weights and biases.
weights = create_weights(shape=[num_inputs, num_outputs])
biases = create_biases(num_outputs)
# Fully connected layer
layer = tf.matmul(input, weights) + biases
if use_relu:
layer = tf.nn.relu(layer)
return layer
layer_conv1 = create_convolutional_layer(input=x,num_input_channels=num_channels,conv_filter_size=filter_size_conv1,
num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
num_input_channels=num_filters_conv1,
conv_filter_size=filter_size_conv2,
num_filters=num_filters_conv2)
layer_conv3= create_convolutional_layer(input=layer_conv2,
num_input_channels=num_filters_conv2,
conv_filter_size=filter_size_conv3,
num_filters=num_filters_conv3)
layer_conv4= create_convolutional_layer(input=layer_conv3,
num_input_channels=num_filters_conv3,
conv_filter_size=filter_size_conv4,
num_filters=num_filters_conv4)
layer_conv5= create_convolutional_layer(input=layer_conv4,
num_input_channels=num_filters_conv4,
conv_filter_size=filter_size_conv5,
num_filters=num_filters_conv5)
layer_conv6= create_convolutional_layer(input=layer_conv5,
num_input_channels=num_filters_conv5,
conv_filter_size=filter_size_conv6,
num_filters=num_filters_conv6)
layer_conv7= create_convolutional_layer(input=layer_conv6,
num_input_channels=num_filters_conv6,
conv_filter_size=filter_size_conv7,
num_filters=num_filters_conv7)
layer_flat = create_flatten_layer(layer_conv7)
layer_fc1 = create_fc_layer(input=layer_flat,num_inputs=layer_flat.get_shape()[1:4].num_elements(),num_outputs=fc_layer_size,
use_relu=True)
layer_fc2 = create_fc_layer(input=layer_fc1, num_inputs=fc_layer_size,num_outputs=num_classes, use_relu=False)
y_pred = tf.nn.softmax(layer_fc2,name='y_pred')
y_pred_cls = tf.argmax(y_pred, dimension=1)
session.run(tf.global_variables_initializer())
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2,labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
session.run(tf.global_variables_initializer())
def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
acc = session.run(accuracy, feed_dict=feed_dict_train)
val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
msg = "Training Epoch {0} --- Training Accuracy: {1:>6.1%}, Validation Accuracy: {2:>6.1%}, Validation Loss: {3:.3f}"
print(msg.format(epoch + 1, acc, val_acc, val_loss))
total_iterations = 0
saver = tf.train.Saver()
def train(num_iteration):
global total_iterations
for i in range(total_iterations,total_iterations + num_iteration):
x_batch, y_true_batch, _, cls_batch = data.train.next_batch(batch_size)
x_valid_batch, y_valid_batch, _, valid_cls_batch = data.valid.next_batch(batch_size)
feed_dict_tr = {x: x_batch,y_true: y_true_batch}
feed_dict_val = {x: x_valid_batch,y_true: y_valid_batch}
session.run(optimizer, feed_dict=feed_dict_tr)
if i % int(data.train.num_examples/batch_size) == 0:
val_loss = session.run(cost, feed_dict=feed_dict_val)
epoch = int(i / int(data.train.num_examples/batch_size))
show_progress(epoch, feed_dict_tr, feed_dict_val, val_loss)
saver.save(session, '/home/nabeel/tf-realsense-gesture/')
total_iterations += num_iteration
train(num_iteration=6000)
Since you are facing an out-of-memory issue in CNNs, you can try the steps below:
Increase the strides of the convolutional layers, i.e., instead of Sh = 1 and Sw = 1, use Sh = 2 and Sw = 2. This reduces the spatial dimensions of the feature maps (each stride-2 layer shrinks the activations by roughly 4x) and hence reduces memory consumption. Code for the same is shown below:
layer = tf.nn.conv2d(input=input,filter=weights,strides=[1, 2, 2, 1],padding='SAME')
Verify whether you really require 7 convolutional layers. You can try fewer convolutional layers (4, 5, or 6) and check the performance, because each convolutional layer, with its set of filters, increases the memory usage.
Replace tf.float32 with tf.float16 and check whether the model still trains without any error.
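For example, a minimal sketch of that change applied to the placeholders and weight helpers above (the label placeholder has to switch to the same dtype, and half precision can hurt numerical stability, so treat this as an experiment):
x = tf.placeholder(tf.float16, shape=[batch_size, img_size, img_size, num_channels], name='x')
y_true = tf.placeholder(tf.float16, shape=[None, num_classes], name='y_true')
def create_weights(shape):
    # half-precision weights cut the parameter memory roughly in half
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05, dtype=tf.float16))
def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size], dtype=tf.float16))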
Use an Inception module instead of a plain convolutional layer.
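As a rough illustration only (a simplified inception-style block, not the full GoogLeNet module with its 1x1 reductions and pooling branch), reusing the create_weights/create_biases helpers from the question:
def inception_block(inputs, num_input_channels, f1, f3, f5):
    # parallel 1x1, 3x3 and 5x5 convolutions over the same input,
    # concatenated along the channel axis
    def branch(kernel_size, num_filters):
        weights = create_weights([kernel_size, kernel_size, num_input_channels, num_filters])
        biases = create_biases(num_filters)
        conv = tf.nn.conv2d(inputs, weights, strides=[1, 1, 1, 1], padding='SAME') + biases
        return tf.nn.relu(conv)
    return tf.concat([branch(1, f1), branch(3, f3), branch(5, f5)], axis=3)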
I am using this example of a VAE.
The only difference I made was to change the loss from binary cross-entropy to MSE, like this:
class OptimizerVAE(object):
def __init__(self, model, learning_rate=1e-3):
"""
OptimizerVAE initializer
:param model: a model object
:param learning_rate: float, learning rate of the optimizer
"""
# reconstruction error: mean squared error (the original code used binary cross entropy here)
self.bce = tf.keras.losses.mse(model.x, model.logits)
self.reconstruction_loss = tf.reduce_mean(tf.reduce_sum(self.bce, axis=-1))
if model.distribution == 'normal':
# KL divergence between normal approximate posterior and standard normal prior
self.p_z = tf.distributions.Normal(tf.zeros_like(model.z), tf.ones_like(model.z))
kl = model.q_z.kl_divergence(self.p_z)
self.kl = tf.reduce_mean(tf.reduce_sum(kl, axis=-1))*0.1
elif model.distribution == 'vmf':
# KL divergence between vMF approximate posterior and uniform hyper-spherical prior
self.p_z = HypersphericalUniform(model.z_dim - 1, dtype=model.x.dtype)
kl = model.q_z.kl_divergence(self.p_z)
self.kl = tf.reduce_mean(kl)*0.1
else:
raise NotImplementedError
self.ELBO = - self.reconstruction_loss - self.kl
self.train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(-self.ELBO)
self.print = {'recon loss': self.reconstruction_loss, 'ELBO': self.ELBO, 'KL': self.kl}
With the original architecture (2 MLP layers), the model runs perfectly, no matter the batch size (specified as "None" in the GitHub code).
I am trying to change this to a convolutional model, but when I change just the encoder to this:
def _encoder(self, x):
"""
Encoder network
:param x: placeholder for input
:return: tuple `(z_mean, z_var)` with mean and concentration around the mean
"""
# 2 hidden layers encoder
#h0 = tf.layers.dense(x, units=self.h_dim * 2, activation=self.activation)
#h1 = tf.layers.dense(h0, units=self.h_dim, activation=self.activation)
h1 = tf.layers.conv1d(x, filters = 32, kernel_size = 7, activation = tf.nn.relu)
h1 = tf.layers.conv1d(h1, filters = 64, kernel_size = 7, activation =tf.nn.relu)
h1 = tf.layers.conv1d(h1, filters = 64, kernel_size = 7, activation = tf.nn.relu)
h1 = tf.layers.flatten(h1)
h1 = tf.layers.dense(h1, 32, activation = tf.nn.relu)
if self.distribution == 'normal':
# compute mean and std of the normal distribution
z_mean = tf.layers.dense(h1, units=self.z_dim, activation=None, name = 'z_output')
z_var = tf.layers.dense(h1, units=self.z_dim, activation=tf.nn.softplus)
elif self.distribution == 'vmf':
# compute mean and concentration of the von Mises-Fisher
z_mean = tf.layers.dense(h1, units=self.z_dim, activation=lambda x: tf.nn.l2_normalize(x, axis=-1))
# the `+ 1` prevent collapsing behaviors
z_var = tf.layers.dense(h1, units=1, activation=tf.nn.softplus) + 1
else:
raise NotImplementedError
return z_mean, z_var
and when running the model, I get the error:
InvalidArgumentError: Incompatible shapes: [32,1] vs. [32,512,1]
[[{{node gradients/SquaredDifference_grad/BroadcastGradientArgs}}]]
32 is the batch_size when running the model. The thing that confuses me is that when I run this with batch_size = 1, the model runs!
Where is this going wrong? Is it the optimizer and the way it averages?
I solved the issue by reshaping the output from the decoder to shape (win_size, 1), since the MLP decoder does not add that extra dimension on its own.
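In code, the fix looks roughly like this (a sketch with hypothetical names; win_size is the sequence length, 512 in the error above, and model.logits is the decoder output):
# give the flat decoder output the same trailing dimension as model.x
model.logits = tf.reshape(model.logits, [-1, win_size, 1])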
I've been walking through some TensorFlow tutorials and am cobbling together a pet experiment. However, I am running into some dimension errors and I can't seem to figure them out.
My goal: I have an input matrix of shape 1xN. I have a training set of dimension 10xN (1 and 10 were chosen arbitrarily). N is intended to represent N samples in a training set: 1 input value mapped to one vector of outputs. You can think of this as 1 input neuron and m output neurons. The training set is a set of these single values mapped to 1-D vectors. I wish to train the network by running the set of these mapped inputs and outputs against it and reducing the error.
The simple algorithm that I am trying to accomplish:
For each value in the input vector
Load the input neuron with that value
Feed forward
Evaluate against the corresponding vector
Repeat to minimize error.
However, I seem to be getting mixed up with how to format the data to feed to the network. I have a placeholder of 1 input neuron and one of n output neurons. I want to follow the above algorithm, but I am not sure I am doing it right:
# Data parameters
num_frames = 10
stimuli_value_low = .00001
stimuli_value_high = 100
pixel_value_low = .00001
pixel_value_high = 256.0
stimuli_dimension = 1
frame_dimension = 10
stimuli = np.random.uniform(stimuli_value_low, stimuli_value_high, (stimuli_dimension, num_frames))
frames = np.random.uniform(pixel_value_low, pixel_value_high, (frame_dimension, num_frames))
# Parameters
learning_rate = 0.01
training_iterations = 1000
display_iteration = 10
# Network Parameters
n_hidden_1 = 100
n_hidden_2 = 100
num_input_neurons = stimuli_dimension
num_output_neurons = frame_dimension
# Create placeholders
input_placeholder = tf.placeholder("float", [None, num_input_neurons])
output_placeholder = tf.placeholder("float", [None, num_output_neurons])
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([num_input_neurons, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_hidden_2, num_output_neurons]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'b2': tf.Variable(tf.random_normal([n_hidden_2])),
'out': tf.Variable(tf.random_normal([num_output_neurons]))
}
# Create model
def neural_net(input_placeholder):
# Hidden fully connected layer
layer_1 = tf.add(tf.matmul(input_placeholder, weights['h1']), biases['b1'])
# Hidden fully connected layer
layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
# Output fully connected layer with a neuron for each pixel
out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
return out_layer
# Construct model
logits = neural_net(input_placeholder)
# Define loss operation and optimizer
loss_operation = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = output_placeholder))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
train_operation = optimizer.minimize(loss_operation)
# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(output_placeholder, 1))
accuracy_operation = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
# Start Training
with tf.Session() as sess:
# Run the initializer
sess.run(init)
for step in range(1, training_iterations + 1):
sess.run(train_operation, feed_dict = {X: stimuli, Y: frames})
if step % display_iteration == 0 or step == 1:
loss, accuracy = sess.run([loss_operation, accuracy_operation], feed_dict = {X: stimuli, Y: frames})
print("Step " + str(step) +
", Loss = " + "{:.4f}".format(loss) +
", Training Accuracy = " + \
"{:.3f}".format(accuracy))
print("Optimization finished!")
I think it has something to do with how I am structuring my data or feeding it to the run function.
Here is the error I am getting:
ValueError Traceback (most recent call last)
<ipython-input-420-7517598734d6> in <module>()
6 for step in range(1, training_iterations + 1):
7
----> 8 sess.run(train_operation, feed_dict = {X: stimuli, Y: frames})
9
10 if iteration % display_iteration == 0 or iteration == 1:
1 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1147 'which has shape %r' %
1148 (np_val.shape, subfeed_t.name,
-> 1149 str(subfeed_t.get_shape())))
1150 if not self.graph.is_feedable(subfeed_t):
1151 raise ValueError('Tensor %s may not be fed.' % subfeed_t)
ValueError: Cannot feed value of shape (1, 10) for Tensor 'Placeholder_6:0', which has shape '(?, 1)'
How can I ensure I am formatting my input data correctly and forming my network correspondingly?
Turns out I had the dimensions of the arrays I was generating backwards:
stimuli = np.random.uniform(stimuli_value_low, stimuli_value_high, (stimuli_dimension, num_frames))
frames = np.random.uniform(pixel_value_low, pixel_value_high, (frame_dimension, num_frames))
should be:
stimuli = np.random.uniform(stimuli_value_low, stimuli_value_high, (num_frames, stimuli_dimension))
frames = np.random.uniform(pixel_value_low, pixel_value_high, (num_frames, frame_dimension))
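As a quick sanity check (illustrative only), the feed arrays are now (num_samples, num_features), matching the (None, ...) placeholders:
assert stimuli.shape == (num_frames, stimuli_dimension)  # (10, 1) fits placeholder shape (?, 1)
assert frames.shape == (num_frames, frame_dimension)     # (10, 10) fits placeholder shape (?, 10)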
I am trying to create an end-to-end trainable offline English handwriting recognition model (without segmenting individual characters). I am using the word dataset from the IAM Handwriting Database for training.
I tried decreasing the learning rate, increasing the batch size, etc., but the loss keeps fluctuating with no significant overall decrease - TensorBoard visualization of the cost at each step.
I am new to TensorFlow, so I could have made some naive error. The code used:
class CRNN(object):
def __init__(self, config):
self.config = config
tf.reset_default_graph()
def read_and_decode(self, filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
# Define how to parse the example
context_features = {
'length': tf.FixedLenFeature([], dtype=tf.int64),
'out_length': tf.FixedLenFeature([], dtype=tf.int64)
}
sequence_features = {
'token': tf.FixedLenSequenceFeature([], dtype=tf.float32),
'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64)
}
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
serialized=serialized_example,
context_features=context_features,
sequence_features=sequence_features)
image = sequence_parsed['token']
label = tf.cast(sequence_parsed['labels'], tf.int32)
length = tf.cast(context_parsed['length'], tf.int32)
lab_length = tf.cast(context_parsed['out_length'], tf.int32)
image_shape = tf.cast(tf.stack([self.config.im_height,
length/self.config.im_height]), tf.int32)
image = tf.reshape(image, image_shape)
# Updating length to represent image width
length = tf.shape(image)[1]
# Batch the variable length tensor with dynamic padding
self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
tensors=[image, label, length, lab_length],
batch_size=self.config.batch_size, dynamic_pad=True)
def net(self):
batch_lab_length = tf.reduce_max(self.lab_lengths)
batch_im_length = tf.reduce_max(self.lengths)
# Reshape to time major
sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size,
self.config.im_height])
# Feed sequences into RNN
with tf.name_scope('RNN'):
self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden,
state_is_tuple=True)
self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
cell_fw=self.cell_fw,
cell_bw=self.cell_bw,
inputs=sequences,
dtype=tf.float32,
sequence_length=self.lengths,
time_major=True,
scope='RNN'
)
# Reshaping to apply the same weights over the timesteps
self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden])
self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden,
self.config.num_classes],
stddev=0.1), name='out_W')
self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b')
# Doing the affine projection
logits = tf.matmul(self.output, self.out_W) + self.out_b
# Reshaping back to the original shape
logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes])
# Time major
logits = tf.transpose(logits, (1, 0, 2))
# Training computation
# Prepare sparse tensor for CTC loss
labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length))
sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs))
labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size])
mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool)
labels_vals = tf.boolean_mask(labels_vals,mask)
labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals,
dense_shape=[self.config.batch_size,
tf.cast(batch_lab_length, tf.int64)])
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths,
preprocess_collapse_repeated=False, ctc_merge_repeated=False,
time_major=True)
self.cost = tf.reduce_mean(self.loss)
# Optimizer
self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,
momentum=0.9, use_nesterov=True).minimize(self.cost)
# Predictions for the training, validation, and test data.
self.train_prediction = tf.nn.ctc_beam_search_decoder(logits,
sequence_length=self.lab_lengths)
def train(self):
num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size)
tf.reset_default_graph()
filename_queue = tf.train.string_input_producer(
[self.config.tfrecord_filename], num_epochs=self.config.num_epochs)
self.read_and_decode(filename_queue)
self.net()
# The op for initializing the variables.
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
saver = tf.train.Saver()
with tf.Session() as sess:
training_summary = tf.summary.scalar("training_cost", self.cost)
writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph)
sess.run(init_op)
print('Initialized')
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start = time.time()
steps_time = start
epoch = 1
for step in range(num_steps):
_, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost,
self.train_prediction,
self.labels, training_summary])
writer.add_summary(train_summ, step)
if (step % 10000 == 0):
preds = np.zeros((predictions[0][0].dense_shape))
i = 0
for idx in predictions[0][0].indices:
preds[idx[0]][idx[1]] = predictions[0][0].values[i]
i+=1
print(time.time() - steps_time)
steps_time = time.time()
print('Minibatch cost at step %d: %f' % (step, c))
print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels],
'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds])
if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0):
print('Epoch', epoch, 'Completed')
epoch+=1
last_step = step
saver.save(sess, "model_BLSTM", global_step=last_step)
writer.close()
print(time.time() - start)
After trying a lot of things unsuccessfully, I found that an incorrect value was being passed to the sequence_length argument of tf.nn.ctc_loss. It should be set to the length of the input sequence, but I had set it to the length of the output sequence (the number of characters in the labels).
More details can be found in the comments under the selected answer to this question - CTC Loss InvalidArgumentError: sequence_length(b) <= time
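Concretely, the corrected call looks like this (a sketch based on the code above, where self.lengths holds the per-example input width, i.e. the number of time steps, and self.lab_lengths only the label lengths):
self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lengths,
                           preprocess_collapse_repeated=False, ctc_merge_repeated=False,
                           time_major=True)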
Also, if one has a GPU, it would be better to use Baidu's warp-ctc GPU implementation (https://github.com/baidu-research/warp-ctc), as it can speed up training a lot.
The problem is that you are feeding raw images into the LSTM, so it is very difficult for it to extract any useful information. The CRNN paper first uses a series of convolutional layers to extract features from the images, and these features are then fed into the LSTM.
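A minimal sketch of that idea (not the exact CRNN architecture; it assumes the images arrive batch-major as [batch, im_height, im_width, 1] with a fixed im_height, and the sequence_length passed to the RNN must shrink by the same pooling factor of 4):
import tensorflow as tf
def conv_features(images, im_height):
    # two small conv + pool stages turn raw pixels into per-column features
    x = tf.layers.conv2d(images, filters=64, kernel_size=3, padding='same', activation=tf.nn.relu)
    x = tf.layers.max_pooling2d(x, pool_size=2, strides=2)
    x = tf.layers.conv2d(x, filters=128, kernel_size=3, padding='same', activation=tf.nn.relu)
    x = tf.layers.max_pooling2d(x, pool_size=2, strides=2)    # height is now im_height // 4
    # treat each remaining width position as one time step for the LSTM
    x = tf.transpose(x, [0, 2, 1, 3])                         # [batch, width, height, channels]
    feature_dim = (im_height // 4) * 128
    seq = tf.reshape(x, [tf.shape(x)[0], -1, feature_dim])    # [batch, time, features]
    return tf.transpose(seq, [1, 0, 2])                       # time-major, as the RNN above expects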
I apologize that I'm not good at English.
I'm trying to build my own Fully Convolutional Network using TensorFlow.
But I have difficulty training this model with my own image data, whereas the MNIST data worked properly.
Here is my FCN model code (not using a pre-trained or pre-built model):
import tensorflow as tf
import numpy as np
Loading MNIST Data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
images_flatten = tf.placeholder(tf.float32, shape=[None, 784])
images = tf.reshape(images_flatten, [-1,28,28,1]) # CNN deals with 3 dimensions
labels = tf.placeholder(tf.float32, shape=[None, 10])
keep_prob = tf.placeholder(tf.float32) # Dropout Ratio
Convolutional Layers
# Conv. Layer #1
W1 = tf.Variable(tf.truncated_normal([3, 3, 1, 4], stddev = 0.1))
b1 = tf.Variable(tf.truncated_normal([4], stddev = 0.1))
FMA = tf.nn.conv2d(images, W1, strides=[1,1,1,1], padding='SAME')
# FMA stands for Fused Multiply Add, which means convolution
RELU = tf.nn.relu(tf.add(FMA, b1))
POOL = tf.nn.max_pool(RELU, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
# Conv. Layer #2
W2 = tf.Variable(tf.truncated_normal([3, 3, 4, 8], stddev = 0.1))
b2 = tf.Variable(tf.truncated_normal([8], stddev = 0.1))
FMA = tf.nn.conv2d(POOL, W2, strides=[1,1,1,1], padding='SAME')
RELU = tf.nn.relu(tf.add(FMA, b2))
POOL = tf.nn.max_pool(RELU, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
# Conv. Layer #3
W3 = tf.Variable(tf.truncated_normal([7, 7, 8, 16], stddev = 0.1))
b3 = tf.Variable(tf.truncated_normal([16], stddev = 0.1))
FMA = tf.nn.conv2d(POOL, W3, strides=[1,1,1,1], padding='VALID')
RELU = tf.nn.relu(tf.add(FMA, b3))
# Dropout
Dropout = tf.nn.dropout(RELU, keep_prob)
# Conv. Layer #4
W4 = tf.Variable(tf.truncated_normal([1, 1, 16, 10], stddev = 0.1))
b4 = tf.Variable(tf.truncated_normal([10], stddev = 0.1))
FMA = tf.nn.conv2d(Dropout, W4, strides=[1,1,1,1], padding='SAME')
LAST_RELU = tf.nn.relu(tf.add(FMA, b4))
Summary: [Conv-ReLU-Pool] - [Conv-ReLU-Pool] - [Conv-ReLU] - [Dropout] - [Conv-ReLU]
Define Loss, Accuracy
prediction = tf.squeeze(LAST_RELU)
# Because FCN returns (1 x 1 x class_num) in training
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
# logits= and labels= passed explicitly by keyword
optimizer = tf.train.AdamOptimizer(0.001)
train = optimizer.minimize(loss)
label_max = tf.argmax(labels, 1)
pred_max = tf.argmax(prediction, 1)
correct_pred = tf.equal(pred_max, label_max)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
Training Model
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(10000):
image_batch, label_batch = mnist.train.next_batch(100)
sess.run(train, feed_dict={images: image_batch, labels: label_batch, keep_prob: 0.8})
if i % 10 == 0:
tr = sess.run([loss, accuracy], feed_dict={images: image_batch, labels: label_batch, keep_prob: 1.0})
print("Step %d, Loss %g, Accuracy %g" % (i, tr[0], tr[1]))
Loss: 0.784 (Approximately)
Accuracy: 94.8% (Approximately)
The problem is that training this model with MNIST data worked very well, but with my own data the loss is always the same (0.6319) and the output layer is always 0.
There is no difference in the code except for the third convolutional layer's filter size. This filter size and the input size, which has been compressed by the previous pooling layers, must have the same width and height; that's why the filter size in this layer is [7, 7].
What is wrong with my model?
The only code that differs between the two cases (MNIST vs. my own data) is:
Placeholder
My own data is (128 x 64 x 1) and the labels are 'eyes' and 'not_eyes'.
images = tf.placeholder(tf.float32, [None, 128, 64, 1])
labels = tf.placeholder(tf.int32, [None, 2])
3rd Convolutional Layer
W3 = tf.Variable(tf.truncated_normal([32, 16, 8, 16], stddev = 0.1))
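(For reference, the shape arithmetic behind this change, assuming the same two 2x2, stride-2 poolings as in the MNIST version:)
# 128 x 64 --pool--> 64 x 32 --pool--> 32 x 16,
# so a [32, 16] VALID filter collapses the feature map to 1 x 1,
# just as the [7, 7] filter does for 28 -> 14 -> 7 on MNIST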
Feeding (Batch)
image_data, label_data = input_data.get_batch(TRAINING_FILE, 10)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for i in range(10000):
image_batch, label_batch = sess.run([image_data, label_data])
sess.run(train, feed_dict={images: image_batch, labels: label_batch, keep_prob: 0.8})
if i % 10 == 0: ... # Validation part is almost same, too...
coord.request_stop()
coord.join(threads)
Here "input_data" is an another python file in the same directory, and "get_batch(TRAINING_FILE, 10)" is the function that returns batch data. The code is:
def get_input_queue(txtfile_name):
images = []
labels = []
for line in open(txtfile_name, 'r'): # Here txt file has data's path, label, label number
cols = re.split(',|\n', line)
labels.append(int(cols[2]))
images.append(tf.image.decode_jpeg(tf.read_file(cols[0]), channels = 1))
input_queue = tf.train.slice_input_producer([images, labels], shuffle = True)
return input_queue
def get_batch(txtfile_name, batch_size):
input_queue = get_input_queue(txtfile_name)
image = input_queue[0]
label = input_queue[1]
image = tf.reshape(image, [128, 64, 1])
batch_image, batch_label = tf.train.batch([image, label], batch_size)
batch_label_one_hot = tf.one_hot(tf.to_int64(batch_label), 2, on_value=1.0, off_value=0.0)
return batch_image, batch_label_one_hot
It seems not to have any problem... Please help me!
Are your inputs scaled appropriately? The JPEGs are in the [0, 255] range and need to be scaled to [-1, 1]. You can try:
image = tf.reshape(image, [128, 64, 1])
image = tf.scalar_mul((1.0/255), image)
image = tf.subtract(image, 0.5)
image = tf.multiply(image, 2.0)
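Equivalently, the same [0, 255] to [-1, 1] scaling in a single expression (just a stylistic variant):
image = tf.cast(image, tf.float32) / 127.5 - 1.0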
What accuracy are you getting with your model for MNIST? It would be helpful if you post the code. Are you using the trained model to evaluate the output for your own data?
A general suggestion on setting up the convolution model is provided here.
Here is the model suggestion according to the article:
INPUT -> [[CONV -> RELU]*N -> POOL?]*M -> [FC -> RELU]*K -> FC
Having more than one CONV -> RELU pair before pooling improves the learning of complex features. Try N=2 instead of 1, as in the sketch below.
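For instance, a sketch of N=2 written in the style of the question's first block (the filter counts here are just placeholders):
# two CONV -> RELU pairs before the first pooling
W1a = tf.Variable(tf.truncated_normal([3, 3, 1, 4], stddev=0.1))
b1a = tf.Variable(tf.truncated_normal([4], stddev=0.1))
conv = tf.nn.relu(tf.nn.conv2d(images, W1a, strides=[1, 1, 1, 1], padding='SAME') + b1a)
W1b = tf.Variable(tf.truncated_normal([3, 3, 4, 4], stddev=0.1))
b1b = tf.Variable(tf.truncated_normal([4], stddev=0.1))
conv = tf.nn.relu(tf.nn.conv2d(conv, W1b, strides=[1, 1, 1, 1], padding='SAME') + b1b)
POOL = tf.nn.max_pool(conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')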
Some other suggestions:
While preparing your data, reduce it to a smaller size than 128x64; try the same size as the MNIST data (note that the images must actually be resized, e.g. with tf.image.resize_images, since tf.reshape alone cannot change the pixel count):
image = tf.reshape(image, [28, 28, 1])
If your eye/not-eye images are color, convert them to grayscale and normalize the values to unit range. You can do this using NumPy or TF; here is how with NumPy:
Grayscale:
img = np.dot(np.array(img, dtype='float32'), [[0.2989],[0.5870],[0.1140]])
Normalize:
mean = np.mean(img, dtype='float32')
std = np.std(img, dtype='float32', ddof=1)
if std < 1e-4: std = 1.
img = (img - mean) / std
I am trying to build a linear classifier on CIFAR-100 using TensorFlow. I got the code from Martin Gorner's MNIST tutorial and changed it a bit. When I run this code, TensorFlow does not train (the code runs, but the accuracy remains 1.0 and the loss/cross-entropy remains at 4605.17). I don't know what is wrong; I am a newbie to TF, and any help is appreciated.
import pickle
import numpy as np
import os
import tensorflow as tf
from tensorflow.python.framework import tensor_util
import math
#imports data
def unpickle(file):
import pickle
with open(file, 'rb') as fo:
dict = pickle.load(fo, encoding='bytes')
return dict
cifar100_test = {}
cifar100_train = {}
labelMap = {}
labelNames = {}
# Load the raw CIFAR-100 data.
cifar100_test = unpickle('dataset/cifar-100-python/test')
cifar100_train = unpickle('dataset/cifar-100-python/train')
labelMap = unpickle('dataset/cifar-100-python/meta')
#tr for training data and te for testing data, X is data, Y is label
Xtr = cifar100_train[b'data']
Yr = cifar100_train[b'fine_labels']
Xte = cifar100_test[b'data']
Ye = cifar100_test[b'fine_labels']
classNames = labelMap[b'fine_label_names']
num_train = Xtr.shape[0]
num_test = Xte.shape[0]
num_class = len(classNames)
Ytr = np.zeros([num_train, num_class])
Yte = np.zeros([num_test, num_class])
Ytr[0:num_train, Yr[0:num_train]] = 1
Yte[0:num_test, Ye[0:num_test]] = 1
# As a sanity check, we print out the size of the training and test data.
print('Train data shape:', Xtr.shape)
print('Train Label shape:', Ytr.shape)
print('Test data shape:', Xte.shape)
print('Test Label shape:', Yte.shape)
print('Name of Predicted Class:', classNames[0]) #indice of the label name is the indice of the class.
Xtrain = Xtr#[:1000]
Xtest = Xte#[:100]
Ytrain = Ytr#[:1000]
Ytest = Yte#[:100]
print('Train data shape:', Xtrain.shape)
print('Train Label shape:', Ytrain.shape)
print('Test data shape:', Xtest.shape)
print('Test Label shape:', Ytest.shape)
Xtrain = np.reshape(Xtrain,(50000, 32, 32, 3)).transpose(0,1,2,3).astype(float)
Xtest = np.reshape(Xtest,(10000, 32, 32, 3)).transpose(0,1,2,3).astype(float)
Xbatches = np.split(Xtrain, 500); #second number is # of batches
Ybatches = np.split(np.asarray(Ytrain), 500);
XtestB = np.split(Xtest, 100);
YtestB = np.split(Ytest, 100);
print('X # of batches:', len(Xbatches))
print('Y # of batches:', len(Ybatches))
# input X: 32x32 color images, the first dimension indexes the images in the mini-batch
X = tf.placeholder(tf.float32, [100, 32, 32, 3])
# correct answers will go here
Y_ = tf.placeholder(tf.float32, [100, 100])
# weights W[3072, 100], 3072 = 32*32*3
W = tf.Variable(tf.zeros([3072, 100]))
# biases b[100]
b = tf.Variable(tf.zeros([100]))
# flatten the images into a single line of pixels
# -1 in the shape definition means "the only possible dimension that will preserve the number of elements"
XX = tf.reshape(X, [-1, 3072])
# The model
Y = tf.nn.softmax(tf.matmul(XX, W) + b)
# loss function: cross-entropy = - sum( Y_i * log(Yi) )
# Y: the computed output vector
# Y_: the desired output vector
# cross-entropy
# log takes the log of each element, * multiplies the tensors element by element
# reduce_mean will add all the components in the tensor
# so here we end up with the total cross-entropy for all images in the batch
cross_entropy = -tf.reduce_mean(Y_ * tf.log(Y)) * 1000.0 # normalized for batches of 100 images,
# the extra factor came from the 10-class MNIST tutorial; with 100 classes the scaling is no longer exact
# accuracy of the trained model, between 0 (worst) and 1 (best)
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# training, learning rate = 0.01
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
# init
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(500):
# the backpropagation training step
t, Loss = sess.run([train_step, cross_entropy], feed_dict={X: Xbatches[i], Y_: Ybatches[i]})
print(Loss)
print(i)
for i in range(100):
print('accuracy:', sess.run(accuracy, feed_dict={X: XtestB[i], Y_: YtestB[i]}))
You compute the accuracy a hundred times after the training process has completed; nothing changes there. You should place your print('accuracy:', ...) inside the for loop in which you perform the backpropagation:
for i in range(500):
# the backpropagation training step
t, Loss = sess.run([train_step, cross_entropy], feed_dict={X: Xbatches[i], Y_: Ybatches[i]})
print(Loss)
print(i)
print('accuracy:', sess.run(accuracy, feed_dict={X: XtestB[i % len(XtestB)], Y_: YtestB[i % len(YtestB)]}))  # wrap around, since there are only 100 test batches
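If you also want a single final number, you can average the per-batch accuracy over the whole test set once training has finished (a small sketch reusing the batches defined above):
test_accuracy = np.mean([sess.run(accuracy, feed_dict={X: XtestB[i], Y_: YtestB[i]})
                         for i in range(len(XtestB))])
print('mean test accuracy:', test_accuracy)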
Sorry for the post; it turns out it was a basic mistake.
I changed the following:
Ytr[0:num_train, Yr[0:num_train]] = 1
Yte[0:num_test, Ye[0:num_test]] = 1
with
Ytr[range(num_train), Yr_temp[range(num_train)]] = 1
Yte[range(num_test), Ye_temp[range(num_test)]] = 1
The first version makes (almost) all values 1, because the slice in the row position broadcasts the whole label array against every row; I just wanted to set the index of the true class to 1 and the other elements to 0. Thanks for your time.
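For anyone hitting the same thing, the difference between the two indexing forms is easiest to see on a tiny, made-up example: a slice in the row position broadcasts the whole label array against every row, while paired index arrays touch exactly one entry per row.
import numpy as np
labels = np.array([2, 0, 1])            # three samples, three classes
wrong = np.zeros((3, 3))
wrong[0:3, labels[0:3]] = 1             # fills columns 0, 1 and 2 in every row -> all ones
right = np.zeros((3, 3))
right[range(3), labels[range(3)]] = 1   # true one-hot: a single 1 per row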