Debug Gradient Computation in TensorFlow

I am using a frozen graph to extract features and then want to train a predictor on top to perform some inference.
Unfortunately, the gradients cannot be computed and my process is killed with RAM demands of more than 100 GB. I have checked several things:
1) Reducing input image sizes or batch sizes does not help, so that is not the problem.
2) I can use intermediate layers of my frozen network (a variant of ResNet) and perform training of my small inference net. But using the later layers leads to huge memory demands (killed). This confuses me because I keep my network static and there are no trainable variables in the ResNet, so I do not think the gradient should depend on which layer of the frozen net I extract.
This behavior is unexpected to me. What are ways to debug what causes this huge memory demand when calling sess.run(train_op, feed_dict)?
More information:
tf.reset_default_graph()
graph1 = tf.Graph()
graph1.__enter__()
input_tensor = tf.placeholder('float', shape=input_shape, name='image')
# Loading frozen graph and mapping inputs
f = gfile.FastGFile(pb_file, 'rb')
graph_def = tf.GraphDef()
# Parses a serialized binary message into the current message.
graph_def.ParseFromString(f.read())
f.close()
_ = tf.import_graph_def(graph_def, input_map={'input0': input_tensor})
# Get feature layer
output_feature = 'import/layer3.00/add:0'
feature_tensor = graph1.get_tensor_by_name(output_feature)
output = tf.contrib.layers.fully_connected(feature_tensor, 100, scope='readout_network')
def batch_data(batches):
    # e.g. batches = [[1,2], [3,4]]
    for batch in batches:
        images = [stimuli.stimuli[n] for n in batch]
        xs = []
        ys = []
        for i, n in enumerate(batch):
            xs.append(this_cache['xs'][n])
            ys.append(this_cache['ys'][n])
        yield prepare_input(images, xs, ys)

loss = ...
params = slim.get_variables_to_restore(include=['readout_network'])
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss, var_list=params)

for feed_dict in batch_data(batches):
    sess.run(train_op, feed_dict=feed_dict)
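One way to test this reasoning directly, before digging further (a debugging sketch that reuses the tensors from the snippet above, not a verified fix): list what TensorFlow actually considers trainable, and cut the gradient path at the feature tensor with tf.stop_gradient before building the readout. If the memory demand disappears, the backward pass was being constructed through the imported graph; if it stays, the problem lies elsewhere (e.g. the sheer size of feature_tensor).

# Debugging sketch (hypothetical check, reusing feature_tensor / the loss from above)
print([v.name for v in tf.trainable_variables()])       # should list only readout_network/* variables
n_ops_before = len(tf.get_default_graph().get_operations())

blocked = tf.stop_gradient(feature_tensor)               # no gradients can flow into the frozen ResNet
output = tf.contrib.layers.fully_connected(blocked, 100, scope='readout_network')
loss = ...                                               # same loss as before
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(
    loss, var_list=slim.get_variables_to_restore(include=['readout_network']))

print(len(tf.get_default_graph().get_operations()) - n_ops_before)  # how many ops minimize() added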

Related

PyTorch isn't running on GPU while torch.cuda.is_available() is True

I want to train on my local GPU, but training only runs on the CPU even though torch.cuda.is_available() is actually True and I can see my GPU. How can I fix this?
My CNN model:
import torch.nn as nn
import torch.nn.functional as F
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# define the CNN architecture
class Net(nn.Module):
    ### TODO: choose an architecture, and complete the class
    def __init__(self):
        super(Net, self).__init__()
        ## Define layers of a CNN
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        # convolutional layer (sees a 112x112x16 tensor)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        # convolutional layer (sees a 56x56x32 tensor)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        # max pooling layer
        self.pool = nn.MaxPool2d(2, 2)
        # linear layer (64 * 28 * 28 -> 500)
        self.fc1 = nn.Linear(64 * 28 * 28, 500)
        # linear layer (500 -> 133)
        self.fc2 = nn.Linear(500, 133)
        # dropout layer (p=0.25)
        self.dropout = nn.Dropout(0.25)
    def forward(self, x):
        ## Define forward behavior
        x = self.pool(F.relu(self.conv1(x)))
        #print(x.shape)
        x = self.pool(F.relu(self.conv2(x)))
        #print(x.shape)
        x = self.pool(F.relu(self.conv3(x)))
        #print(x.shape)
        # flatten image input
        x = x.view(-1, 64 * 28 * 28)
        # add dropout layer
        x = self.dropout(x)
        # add 1st hidden layer, with relu activation function
        x = F.relu(self.fc1(x))
        # add dropout layer
        x = self.dropout(x)
        # add 2nd (output) layer
        x = self.fc2(x)
        return x

#-#-# You do NOT have to modify the code below this line. #-#-#

# instantiate the CNN
model_scratch = Net()

# move tensors to GPU if CUDA is available
if use_cuda:
    print("TRUE")
    model_scratch = model_scratch.cuda()
The train function:
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path):
    """returns trained model"""
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf
    loaders_scratch = {'train': train_loader, 'valid': valid_loader, 'test': test_loader}
    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(loaders['train']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## find the loss and update the model parameters accordingly
            ## record the average training loss, using something like
            ## train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_loss += loss.item()*data.size(0)
        ######################
        # validate the model #
        ######################
        model.eval()
        for batch_idx, (data, target) in enumerate(loaders['valid']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## update the average validation loss
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # update average validation loss
            valid_loss += loss.item()*data.size(0)
        # calculate average losses
        train_loss = train_loss/len(train_loader.dataset)
        valid_loss = valid_loss/len(valid_loader.dataset)
        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))
        ## TODO: save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
                valid_loss_min,
                valid_loss))
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss
    # return trained model
    return model

# train the model
loaders_scratch = {'train': train_loader, 'valid': valid_loader, 'test': test_loader}
model_scratch = train(100, loaders_scratch, model_scratch, optimizer_scratch,
                      criterion_scratch, use_cuda, 'model_scratch.pt')

# load the model that got the best validation accuracy
model_scratch.load_state_dict(torch.load('model_scratch.pt'))
I am getting "TRUE" printed, so torch.cuda.is_available() is True, but training is still not running on the GPU; it only runs on the CPU.
A screenshot (not included here) shows the process running on the CPU at around 62% usage.
To utilize CUDA in PyTorch you have to specify that you want to run your code on the GPU device.
A couple of lines like:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
will determine whether you have CUDA available, and if so, use it as your device.
Later in the code you have to pass your tensors and model to this device:
net = net.to(device)
and do the same for the other tensors that need to go to the GPU, such as your test and training values.
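As a small, self-contained sketch of that pattern (a toy model and random data here, not the Net/loaders from the question), the key point is that both the model's parameters and every batch must end up on the same device:

import torch
import torch.nn as nn

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = nn.Linear(10, 2).to(device)        # move the parameters to the GPU once
data = torch.randn(4, 10).to(device)       # move every batch the same way
target = torch.randint(0, 2, (4,)).to(device)

output = model(data)
loss = nn.functional.cross_entropy(output, target)
loss.backward()
print(output.device)                        # prints cuda:0 when a GPU is actually used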
If you are experiencing an issue where your model is only using the CPU during training even though your GPU is available, it's likely due to the data loading and transformation process. When loading images from your local directory and applying transforms on your data, the majority of the time during training is spent on the data loading process, which is performed on the CPU.
To resolve this issue, you can preprocess your data by applying your custom transforms once and then saving the results. This way, when you load the preprocessed data, you can take advantage of the GPU's performance during training. This can help to significantly improve the training time of your model.
In summary, if you are facing a problem with the model using the CPU instead of the GPU during training, it could be due to the data loading process. To fix this, preprocess your data and save the results, then use the preprocessed data while training. This will allow you to take advantage of the GPU's performance and reduce training time.
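A rough sketch of that "preprocess once, then train from memory" idea (paths, image size, and DataLoader settings are assumptions, not taken from the question):

import torch
from torchvision import datasets, transforms
from torch.utils.data import TensorDataset, DataLoader

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
dataset = datasets.ImageFolder('data/train', transform=transform)

# apply the transforms a single time and cache the results as tensors
images = torch.stack([img for img, _ in dataset])
labels = torch.tensor([label for _, label in dataset])
torch.save((images, labels), 'train_cache.pt')

# later: load the cached tensors and build a fast in-memory loader
images, labels = torch.load('train_cache.pt')
loader = DataLoader(TensorDataset(images, labels), batch_size=32, shuffle=True,
                    num_workers=4, pin_memory=True)

This assumes the preprocessed dataset fits in memory; raising num_workers on the original DataLoader is a lighter-weight alternative.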

Simple softmax classifier in tensorflow

So I am trying to write a simple softmax classifier in TensorFlow.
Here is the code:
# Neural network parameters
n_hidden_units = 500
n_classes = 10
# training set placeholders
input_X = tf.placeholder(dtype='float32',shape=(None,X_train.shape[1], X_train.shape[2]),name="input_X")
input_y = tf.placeholder(dtype='int32', shape=(None,), name="input_y")
# hidden layer
dim = X_train.shape[1]*X_train.shape[2] # dimension of each training data point
flatten_X = tf.reshape(input_X, shape=(-1, dim))
weights_hidden_layer = tf.Variable(initial_value=np.zeros((dim,n_hidden_units)), dtype ='float32')
bias_hidden_layer = tf.Variable(initial_value=np.zeros((1,n_hidden_units)), dtype ='float32')
hidden_layer_output = tf.nn.relu(tf.matmul(flatten_X, weights_hidden_layer) + bias_hidden_layer)
# output layer
weights_output_layer = tf.Variable(initial_value=np.zeros((n_hidden_units,n_classes)), dtype ='float32')
bias_output_layer = tf.Variable(initial_value=np.zeros((1,n_classes)), dtype ='float32')
output_logits = tf.matmul(hidden_layer_output, weights_output_layer) + bias_output_layer
predicted_y = tf.nn.softmax(output_logits)
# loss
one_hot_labels = tf.one_hot(input_y, depth=n_classes, axis = -1)
loss = tf.losses.softmax_cross_entropy(one_hot_labels, output_logits)
# optimizer
optimizer = tf.train.MomentumOptimizer(0.01, 0.5).minimize(
    loss, var_list=[weights_hidden_layer, bias_hidden_layer, weights_output_layer, bias_output_layer])
This compiles, and I have checked the shapes of all the tensors; they coincide with what I expect.
However, I tried to run the optimizer using the following code:
# running the optimizer
s = tf.InteractiveSession()
s.run(tf.global_variables_initializer())
for i in range(5):
    s.run(optimizer, {input_X: X_train, input_y: y_train})
    loss_i = s.run(loss, {input_X: X_train, input_y: y_train})
    print("loss at iter %i:%.4f" % (i, loss_i))
And the loss kept being the same in all iterations!
I must have messed up something, but I fail to see what.
Any ideas? I would also appreciate it if somebody left comments regarding code style and/or TensorFlow tips.
You have made a mistake: you are initializing your weights using np.zeros. Use np.random.normal instead. You can scale this Gaussian distribution based on the number of inputs going to a particular neuron. You can read more about it here.
The reason you want to initialize with a Gaussian distribution is that you want to break symmetry. If all the weights are initialized to zero, then you can work through backpropagation to see that all the weights evolve identically.
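A minimal sketch of the suggested change applied to the hidden-layer weights, reusing dim and n_hidden_units from the question (the 1/sqrt(fan_in) scale is an assumption; any small random scale already breaks the symmetry):

weights_hidden_layer = tf.Variable(
    initial_value=np.random.normal(0.0, 1.0 / np.sqrt(dim), (dim, n_hidden_units)),
    dtype='float32')
# biases can stay at zero; symmetry is already broken by the random weights
bias_hidden_layer = tf.Variable(initial_value=np.zeros((1, n_hidden_units)), dtype='float32')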
One could visualize the weight histograms using TensorBoard to make this easier. I executed your code for this; a few more lines are needed to set up TensorBoard logging, but the histogram summaries of the weights can be added easily.
Initialized to zeros
weights_hidden_layer = tf.Variable(initial_value=np.zeros((784,n_hidden_units)), dtype ='float32')
tf.summary.histogram("weights_hidden_layer",weights_hidden_layer)
Xavier initialization
initializer = tf.contrib.layers.xavier_initializer()
weights_hidden_layer = tf.Variable(initializer(shape=(784,n_hidden_units)), dtype ='float32')
tf.summary.histogram("weights_hidden_layer",weights_hidden_layer)
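For reference, the "few more lines" of TensorBoard setup mentioned above would look roughly like this (the log directory is an assumption), reusing the session and placeholders from the question:

merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('/tmp/softmax_logs', s.graph)
for i in range(5):
    s.run(optimizer, {input_X: X_train, input_y: y_train})
    summary = s.run(merged, {input_X: X_train, input_y: y_train})
    writer.add_summary(summary, i)
writer.close()
# then inspect the histograms with: tensorboard --logdir=/tmp/softmax_logs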

In TensorFlow, how to use a restored meta-graph if the meta-graph was fed with TFRecord input (without placeholders)

I trained a network with a TFRecord input pipeline. In other words, there were no placeholders. A simple example would be:
input, truth = _get_next_batch() # TFRecord. `input` is not a tf.placeholder
net = Model(input)
net.set_loss(truth)
optimizer = tf...(net.loss)
Let's say I acquired three files: ckpt-20000.meta, ckpt-20000.data-0000-of-0001, and ckpt-20000.index. I understand that one can later import the meta-graph using the .meta file and access tensors such as:
new_saver = tf.train.import_meta_graph('ckpt-20000.meta')
new_saver.restore(sess, 'ckpt-20000')
logits = tf.get_collection("logits")[0]
However, the meta-graph does not have a placeholder at the beginning of the pipeline. Is there a way that I can use the meta-graph and query inference for an input?
For information, in a query application (or script) I used to define a model with a placeholder and restore the model weights (see below). I am wondering if I can just utilize the meta-graph without re-defining the model, since that would be much simpler.
input = tf.placeholder(...)
net = Model(input)
tf.train.Saver().restore(sess, 'ckpt-20000')
lgt = sess.run(net.logits, feed_dict = {input:img})
You can build a graph that uses placeholder_with_default() for the inputs, so it can use both the TFRecord input pipeline and feed_dict{}.
An example:
input, truth = _get_next_batch()
_x = tf.placeholder_with_default(input, shape=[...], name='input')
_y = tf.placeholder_with_default(truth, shape=[...], name='label')
net = Model(_x)
net.set_loss(_y)
optimizer = tf...(net.loss)
Then during inference,
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    new_saver = tf.train.import_meta_graph('ckpt-20000.meta')
    new_saver.restore(sess, 'ckpt-20000')
    # Get the tensors by their variable name
    input = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name(...)
    # Now you can feed the inputs to your tensors
    lgt = sess.run(logits, feed_dict={input: img})
In the above example, if you don't feed input, then the input will be read from the TFRecord input pipeline.
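For example, with the session restored as above, the same logits tensor can be evaluated in either mode (assuming the TFRecord pipeline is initialized and running):

lgt = sess.run(logits, feed_dict={input: img})   # uses the fed image
lgt = sess.run(logits)                           # falls back to the TFRecord input pipeline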
Is there a way to do it without placeholders at test time, though? It should be possible to re-use the graph with a new input pipeline without resorting to slow placeholders (i.e. the test dataset may be very large). placeholder_with_default is a suboptimal solution in that case.
The recommended way is to save two meta-graphs: one for training/validation/testing, and the other one for inference.
See Building a SavedModel:
export_dir = ...
...
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
with tf.Session(graph=tf.Graph()) as sess:
    ...
    builder.add_meta_graph_and_variables(sess,
                                         [tag_constants.TRAINING],
                                         signature_def_map=foo_signatures,
                                         assets_collection=foo_assets)
...
# Add a second MetaGraphDef for inference.
with tf.Session(graph=tf.Graph()) as sess:
    ...
    builder.add_meta_graph([tag_constants.SERVING])
...
builder.save()
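At serving time, the inference meta-graph can then be loaded back by its tag; a short sketch in the same style (the tensor name is a placeholder here, not from the guide):

with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(sess, [tag_constants.SERVING], export_dir)
    # look up input/output tensors by name in sess.graph, e.g.:
    # logits = sess.graph.get_tensor_by_name('logits:0')
    # lgt = sess.run(logits, feed_dict={...})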
The NMT tutorial also provides a detailed example of creating multiple graphs with shared variables: Neural Machine Translation (seq2seq) Tutorial - Building Training, Eval, and Inference Graphs:
train_graph = tf.Graph()
eval_graph = tf.Graph()
infer_graph = tf.Graph()

with train_graph.as_default():
    train_iterator = ...
    train_model = BuildTrainModel(train_iterator)
    initializer = tf.global_variables_initializer()

with eval_graph.as_default():
    eval_iterator = ...
    eval_model = BuildEvalModel(eval_iterator)

with infer_graph.as_default():
    infer_iterator, infer_inputs = ...
    infer_model = BuildInferenceModel(infer_iterator)

checkpoints_path = "/tmp/model/checkpoints"

train_sess = tf.Session(graph=train_graph)
eval_sess = tf.Session(graph=eval_graph)
infer_sess = tf.Session(graph=infer_graph)

train_sess.run(initializer)
train_sess.run(train_iterator.initializer)

for i in itertools.count():
    train_model.train(train_sess)
    if i % EVAL_STEPS == 0:
        checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=i)
        eval_model.saver.restore(eval_sess, checkpoint_path)
        eval_sess.run(eval_iterator.initializer)
        while data_to_eval:
            eval_model.eval(eval_sess)
    if i % INFER_STEPS == 0:
        checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=i)
        infer_model.saver.restore(infer_sess, checkpoint_path)
        infer_sess.run(infer_iterator.initializer, feed_dict={infer_inputs: infer_input_data})
        while data_to_infer:
            infer_model.infer(infer_sess)

Tensorflow: proper queueing/batching structure using training and validation set

I am trying to replicate the structure used in the TensorBoard MNIST example from the recent 2017 dev summit (code found here). In it, feed_dicts are used to alternate between the training and validation sets; however, they use the very non-transparent mnist.train.next_batch, which makes it really difficult to base your own iteration on.
Admittedly, this may also be because I'm struggling to understand the queueing implementation in TensorFlow, and explicit examples seem to be in short supply, especially for TF > v1.0.
I've made my own attempt at an image-classifying CNN based on various examples I stumbled across. Originally I had it working with just training data by storing the data in pre-loaded variables (it's a small data set). I assumed it would be easier to get the train/valid swap working by feeding data from filenames, so I tried to change it to that.
Between changing the format and trying to implement the feed_dict train/valid structure, I get the following error:
"You must feed a value for placeholder tensor 'input/Placeholder_2' with dtype string".
Any tips on how to get it working, or further explanation of how the slicer/train.batch/QueueRunner actually work together, would be a great help, as I have found the TensorFlow tutorials to be lacking in terms of explaining the basic workflow between them.
I have a feeling I have the train.batch in completely the wrong spot and that it should probably be in the feed_dict def, but I have no idea otherwise. Thanks!
import tensorflow as tf
import numpy as np
from tensorflow.python.framework import dtypes

# Input - 216x216x1 images; ~900 training images, ~350 validation
# Want to do batches of 5 for training, 20 for validation
learn_rate = .0001
drop_keep = 0.9
train_batch = 5
test_batch = 20
epochs = 1
iterations = int((885/train_batch) * epochs)
#
#
# A BUNCH OF (graph-building) HELPER DEFINITIONS EXCLUDED FOR BREVITY
#
#
# x_init will be fed a list of .jpg filenames (ex: [/file0.jpg, /file1.jpg, ...])
# y_init will be fed an array of one-hot classes (ex: [[0,1,0], [1,0,0], ...])
sess = tf.InteractiveSession()

with tf.name_scope('input'):
    batch_size = tf.placeholder(tf.int32)
    keep_prob = tf.placeholder(tf.float32)
    x_init = tf.placeholder(dtype=tf.string, shape=(None))
    y_init = tf.placeholder(dtype=np.int32, shape=(None,3))  # 3 classes
    image, label = tf.train.slice_input_producer([x_init, y_init])
    file = tf.read_file(image)
    image = tf.image.decode_jpeg(file, channels=1)
    image = tf.cast(image, tf.float32)
    image.set_shape([216,216,1])
    label = tf.cast(label, tf.int32)
    images, labels = tf.train.batch([image, label], batch_size=batch_size)
conv1 = conv_layer(images, [5,5,1], 40, 'conv1')
#
#
# skip the rest of graph defining/functions (merged,train_step)
# very similar to what is found in the MNIST example.
#
#
tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(OUTPUT_LOC + '/train',sess.graph)
test_writer = tf.summary.FileWriter(OUTPUT_LOC + '/test')
sess.run(tf.global_variables_initializer())
#xTrain, yTrain, xTest, yTest are the train/valid images/labels lists
def feed_dict(train=True):
    if train:
        batch = train_batch
        keep = drop_keep
        xval = xTrain
        yval = yTrain
    else:
        batch = test_batch
        keep = 1
        xval = xTest
        yval = yTest
    return {x_init: xval, y_init: yval, batch_size: batch, keep_prob: keep}
#If I run "threads", I get the error. It works up until here.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess,coord=coord)
# Don't know what works here or what doesn't.
for i in range(iterations):
    if i % 10 == 0:
        summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
        test_writer.add_summary(summary, i)
        print('Accuracy at step %s: %s' % (i, acc))
    else:
        if i % 100 == 99:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True), options=run_options, run_metadata=run_metadata)
            train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
            train_writer.add_summary(summary, i)
            print('Adding run metadata for', i)
        else:  # Record a summary
            summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
            train_writer.add_summary(summary, i)

coord.request_stop()
train_writer.close()
test_writer.close()
sess.close()
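A note on why the placeholder error above tends to appear (an inference from the code, not a verified diagnosis): tf.train.start_queue_runners launches background threads that run the enqueue ops of the slice_input_producer queue without any feed_dict, so the x_init/y_init placeholders those ops depend on are never fed. In later TF 1.x releases the usual way around this is to build the pipeline with tf.data, where the placeholders only need to be fed when the iterator is (re)initialized; a rough sketch reusing the placeholders from the question:

def _parse(filename, label):
    img = tf.image.decode_jpeg(tf.read_file(filename), channels=1)
    img = tf.cast(img, tf.float32)
    img.set_shape([216, 216, 1])
    return img, label

dataset = tf.data.Dataset.from_tensor_slices((x_init, y_init)).map(_parse).batch(5)
iterator = dataset.make_initializable_iterator()
images, labels = iterator.get_next()

# switch between training and validation data by re-initializing the iterator
sess.run(iterator.initializer, feed_dict={x_init: xTrain, y_init: yTrain})
sess.run(iterator.initializer, feed_dict={x_init: xTest, y_init: yTest})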

TensorFlow - running inference (feed-forward) twice on the same image and model ends up with different network outputs

Something weird is happening to me: I run my feed-forward function in TensorFlow twice, loading the same model and using the same image, but I see different network outputs.
Here I build the graph:
image = tf.placeholder(tf.float32, shape=[227,227,3])
output = train.forward(image, net)
and then I load in the model
sess = tf.Session()
variable_averages = tf.train.ExponentialMovingAverage(0.9999)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
saver.restore(sess, 'train/model.ckpt-20000')
and then I do the simple evaluation
imgdata = scipy.misc.imread(imagefile)
out_result = sess.run(output, feed_dict={image:imgdata})
I run the above, and on each run I get a different out_result value.
Here is how I designed the forward function (it reads in a list of network layers and constructs the graph in a loop):
# train.py
def forward(images, net):
    """
    network feed-forwarding
    Args:
        images: input batch of images
    Returns:
        feed-forward output before softmax activation
    """
    acts = []
    firstFC = True
    for layerIndex, layer in enumerate(net.layers):
        if isinstance(layer, ConvLayer):
            if len(acts) == 0:
                acts.append(convop(images, layer.nfilters, layer.filter_size, layer.stride_size,
                                   layer.padding, layer.init, layer.activation, layerIndex))
            else:
                acts.append(convop(acts[-1], layer.nfilters, layer.filter_size, layer.stride_size,
                                   layer.padding, layer.init, layer.activation, layerIndex))
        if isinstance(layer, PoolLayer):
            acts.append(maxpoolop(acts[-1], layer.pool_size, layer.stride_size, layer.padding, layerIndex))
        if isinstance(layer, LRNLayer):
            acts.append(lrnop(acts[-1], layer.depth_radius, layer.bias, layer.alpha, layer.beta, layerIndex))
        if isinstance(layer, FCLayer):
            if firstFC:
                indim = np.prod(np.array(acts[-1].get_shape()[1:].as_list()))
                sample_size = acts[-1].get_shape().as_list()[0]
                reshape = tf.reshape(acts[-1], [sample_size, indim])
                acts.pop()
                acts.append(reshape)
                firstFC = False
            if layerIndex == len(net.layers) - 1:
                acts.append(fcop(acts[-1], layer.layer_size, layer.init, layer.activation, layerIndex, isOut=True))
            else:
                acts.append(fcop(acts[-1], layer.layer_size, layer.init, layer.activation,
                                 layerIndex, dropout=layer.dropout, wdecay=layer.l2_norm_wd))
    return acts[-1]
# (implementations of the individual ops omitted in the question)
def convop(inpOp, num_outfmap, filter_size, stride_size, padType, init, act, layerIndex): ...
def lrnop(inpOp, depth_radius, bias, alpha, beta, layerIndex): ...
def fcop(inpOp, nOut, init, act, layerIndex, isOut=False, dropout=1.0, wdecay=0.0): ...
def maxpoolop(inpOp, filter_size, stride_size, padType, layerIndex): ...
I am not sure what might be going wrong. Any ideas?
Many thanks!
It was certainly due to dropout being active at inference. While it is useful for reducing overfitting during training, you have to deactivate dropout at inference to use the whole network. Otherwise a random sub-part of your network is activated, leading to non-reproducible outputs.
Stumbled upon your question a bit late, sorry.
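Since the fcop internals are not shown above, here is a purely hypothetical sketch of the usual pattern: drive dropout from a placeholder that defaults to 1.0, feed the real keep probability only during training, and leave the default at inference so the output becomes deterministic.

keep_prob = tf.placeholder_with_default(1.0, shape=[], name='keep_prob')

# inside the fully connected op, dropout would use the placeholder instead of a fixed rate:
#   fc = tf.nn.dropout(fc, keep_prob=keep_prob)

# training:  sess.run(train_op, feed_dict={image: imgdata, keep_prob: 0.5})
# inference: sess.run(output, feed_dict={image: imgdata})   # keep_prob defaults to 1.0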