Running distributed Tensorflow with InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float

I have implemented a variational autoencoder with TensorFlow on a single machine. Now I am trying to run it on my cluster with the distributed mechanism provided by TensorFlow, but the following problem has had me stuck for several days.
Traceback (most recent call last):
File "/home/yama/mfs/ZhuSuan/examples/vae.py", line 265, in <module>
print('>> Test log likelihood = {}'.format(np.mean(test_lls)))
File "/usr/lib/python2.7/contextlib.py", line 35, in __exit__
self.gen.throw(type, value, traceback)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/training/supervisor.py", line 942, in managed_session
self.stop(close_summary_writer=close_summary_writer)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/training/supervisor.py", line 768, in stop
stop_grace_period_secs=self._stop_grace_secs)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/training/coordinator.py", line 322, in join
six.reraise(*self._exc_info_to_raise)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/training/coordinator.py", line 267, in stop_on_exception
yield
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/training/coordinator.py", line 411, in run
self.run_loop()
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/training/supervisor.py", line 972, in run_loop
self._sv.global_step])
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 372, in run
run_metadata_ptr)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 636, in _run
feed_dict_string, options, run_metadata)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 708, in _do_run
target_list, options, run_metadata)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 728, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors.InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:worker/replica:0/task:0/gpu:0"]()]]
[[Node: model_1/fully_connected_10/Relu_G88 = _Recv[client_terminated=false, recv_device="/job:worker/replica:0/task:0/cpu:0", send_device="/job:worker/replica:0/task:0/gpu:0", send_device_incarnation=3964479821165574552, tensor_name="edge_694_model_1/fully_connected_10/Relu", tensor_type=DT_FLOAT, _device="/job:worker/replica:0/task:0/cpu:0"]()]]
Caused by op u'Placeholder', defined at:
File "/home/yama/mfs/ZhuSuan/examples/vae.py", line 201, in <module>
x = tf.placeholder(tf.float32, shape=(None, x_train.shape[1]))
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 895, in placeholder
name=name)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1238, in _placeholder
name=name)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 704, in apply_op
op_def=op_def)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2260, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/mfs/yama/tensorflow/local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1230, in __init__
self._traceback = _extract_stack()
Here is my code; I am only pasting the main function for simplicity:
if __name__ == "__main__":
tf.set_random_seed(1234)
# Load MNIST
data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'data', 'mnist.pkl.gz')
x_train, t_train, x_valid, t_valid, x_test, t_test = \
dataset.load_mnist_realval(data_path)
x_train = np.vstack([x_train, x_valid])
np.random.seed(1234)
x_test = np.random.binomial(1, x_test, size=x_test.shape).astype('float32')
# Define hyper-parameters
n_z = 40
# Define training/evaluation parameters
lb_samples = 1
ll_samples = 5000
epoches = 10
batch_size = 100
test_batch_size = 100
iters = x_train.shape[0] // batch_size
test_iters = x_test.shape[0] // test_batch_size
test_freq = 10
ps_hosts = FLAGS.ps_hosts.split(",")
worker_hosts = FLAGS.worker_hosts.split(",")
# Create a cluster from the parameter server and worker hosts.
clusterSpec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
print("Create and start a server for the local task.")
# Create and start a server for the local task.
server = tf.train.Server(clusterSpec,
job_name=FLAGS.job_name,
task_index=FLAGS.task_index)
print("Start ps and worker server")
if FLAGS.job_name == "ps":
server.join()
elif FLAGS.job_name == "worker":
#set distributed device
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % FLAGS.task_index,
cluster=clusterSpec)):
print("Build the training computation graph")
# Build the training computation graph
x = tf.placeholder(tf.float32, shape=(None, x_train.shape[1]))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1e-4)
with tf.variable_scope("model") as scope:
with pt.defaults_scope(phase=pt.Phase.train):
train_model = M1(n_z, x_train.shape[1])
train_vz_mean, train_vz_logstd = q_net(x, n_z)
train_variational = ReparameterizedNormal(
train_vz_mean, train_vz_logstd)
grads, lower_bound = advi(
train_model, x, train_variational, lb_samples, optimizer)
infer = optimizer.apply_gradients(grads)
print("Build the evaluation computation graph")
# Build the evaluation computation graph
with tf.variable_scope("model", reuse=True) as scope:
with pt.defaults_scope(phase=pt.Phase.test):
eval_model = M1(n_z, x_train.shape[1])
eval_vz_mean, eval_vz_logstd = q_net(x, n_z)
eval_variational = ReparameterizedNormal(
eval_vz_mean, eval_vz_logstd)
eval_lower_bound = is_loglikelihood(
eval_model, x, eval_variational, lb_samples)
eval_log_likelihood = is_loglikelihood(
eval_model, x, eval_variational, ll_samples)
global_step = tf.Variable(0)
saver = tf.train.Saver()
summary_op = tf.merge_all_summaries()
init_op = tf.initialize_all_variables()
# Create a "supervisor", which oversees the training process.
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
logdir=LogDir,
init_op=init_op,
summary_op=summary_op,
saver=saver,
global_step=global_step,
save_model_secs=600)
# Run the inference
with sv.managed_session(server.target) as sess:
epoch = 0
while not sv.should_stop() and epoch < epoches:
#for epoch in range(1, epoches + 1):
np.random.shuffle(x_train)
lbs = []
for t in range(iters):
x_batch = x_train[t * batch_size:(t + 1) * batch_size]
x_batch = np.random.binomial( n=1, p=x_batch, size=x_batch.shape).astype('float32')
_, lb = sess.run([infer, lower_bound], feed_dict={x: x_batch})
lbs.append(lb)
if epoch % test_freq == 0:
test_lbs = []
test_lls = []
for t in range(test_iters):
test_x_batch = x_test[
t * test_batch_size: (t + 1) * test_batch_size]
test_lb, test_ll = sess.run(
[eval_lower_bound, eval_log_likelihood],
feed_dict={x: test_x_batch}
)
test_lbs.append(test_lb)
test_lls.append(test_ll)
print('>> Test lower bound = {}'.format(np.mean(test_lbs)))
print('>> Test log likelihood = {}'.format(np.mean(test_lls)))
sv.stop()
I have tried to correct my code for several days, but all my efforts have failed. Looking forward to your help!

The most likely cause of this exception is that one of the operations that the tf.train.Supervisor runs in the background depends on the tf.placeholder() tensor x, but doesn't have enough information to feed a value for it.
The most likely culprit is summary_op = tf.merge_all_summaries(), because library code often summarizes values that depend on the training data. To prevent the supervisor from collecting summaries in the background, pass summary_op=None to the tf.train.Supervisor constructor:
# Create a "supervisor", which oversees the training process.
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
logdir=LogDir,
init_op=init_op,
summary_op=None,
saver=saver,
global_step=global_step,
save_model_secs=600)
After doing this, you will need to make alternative arrangements to collect summaries. The easiest way to do this is to pass summary_op to sess.run() periodically, then pass the result to sv.summary_computed().
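For example, a minimal sketch of that pattern inside the inner training loop above, reusing sess, sv, summary_op, x and x_batch from the question's code (the 100-iteration interval is just an illustration):
if t % 100 == 0:
    # Evaluate the summaries with a real feed, since they depend on x...
    summary_str = sess.run(summary_op, feed_dict={x: x_batch})
    # ...and hand the serialized result back to the supervisor's writer.
    sv.summary_computed(sess, summary_str)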

I came across a similar thing. The chief was going down with the aforementioned error message. However, since I was using the MonitoredTrainingSession rather than a self-made Supervisor, I was able to solve the problem by disabling the default summary saving. To disable it, you have to provide
save_summaries_secs=None,
save_summaries_steps=None,
to the constructor of the MonitoredTrainingSession. Afterwards, everything ran smoothly!
Code on Github
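For reference, a rough sketch of what that constructor call can look like (the checkpoint_dir path, train_op, and the feed are placeholders for your own setup, not taken from the post):
with tf.train.MonitoredTrainingSession(
        master=server.target,
        is_chief=(FLAGS.task_index == 0),
        checkpoint_dir="/tmp/train_logs",   # placeholder path
        save_summaries_secs=None,
        save_summaries_steps=None) as mon_sess:
    while not mon_sess.should_stop():
        # Feed the placeholders yourself; no background summary op will run.
        mon_sess.run(train_op, feed_dict={x: x_batch})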

I had the exact same problem. Following mrry's suggestion, I was able to work this out by:
1. Disabling summary logging in the supervisor by setting summary_op=None (as mrry suggested).
2. Creating my own summary_op and passing it to sess.run() along with the rest of the ops to be evaluated. Holding on to the resulting summary; let's say it's called 'my_summary'.
3. Creating my own summary writer and calling it with 'my_summary', e.g. summary_writer.add_summary(my_summary, epoch_count).
To clarify, I did not use mrry's suggestion to do sess.run(summary_op) and sv.summary_computed(), but instead ran the summary_op along with the other operations and then wrote out the summary myself. You might also want to condition the summary writing on being the chief.
So basically, you need to bypass the Supervisor's summary writing services completely. This seems like a surprising limitation/bug of the Supervisor, since it isn't exactly uncommon to want to log things that depend on the input (which lives in a placeholder). For example, in my network (an autoencoder) the cost depends on the input.
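In code, that bypass looks roughly like the following sketch (using the old-style summary API that appears elsewhere in this post; is_chief and the 100-iteration interval are illustrative assumptions, and LogDir is the log directory from the question):
my_summary_op = tf.merge_all_summaries()           # build your own merged op
summary_writer = tf.train.SummaryWriter(LogDir)    # and your own writer
# ... inside the training loop:
if is_chief and t % 100 == 0:
    _, lb, my_summary = sess.run([infer, lower_bound, my_summary_op],
                                 feed_dict={x: x_batch})
    summary_writer.add_summary(my_summary, epoch)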

Related

Tensorflow Estimator.predict() fails

I am recreating the DnCNN (a Gaussian denoiser), which does image-to-image prediction with a series of convolutional layers. It trains perfectly fine, but when I try to do list(model.predict(..)),
I get the error:
labels must not be None
I actually put all of the spec arguments of my EstimatorSpec in there explicitly, as they are lazily evaluated depending on the method (train/eval/predict) that is called on the Estimator.
def DnCNN_model_fn (features, labels, mode):
# some convolutions here
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=conv_last + input_layer,
loss=tf.losses.mean_squared_error(
labels=labels,
predictions=conv_last + input_layer),
train_op=tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1e-08).minimize(
loss=tf.losses.mean_squared_error(
labels=labels,
predictions=conv_last + input_layer),
global_step=tf.train.get_global_step()),
eval_metric_ops={
"accuracy": tf.metrics.mean_absolute_error(
labels=labels,
predictions=conv_last + input_layer)}
)
Putting it into an estimator:
d = datetime.datetime.now()
DnCNN = tf.estimator.Estimator(
model_fn=DnCNN_model_fn,
model_dir=root + 'model/' +
"DnCNN_{}_{}_{}_{}".format(d.month, d.day, d.hour, d.minute),
config=tf.estimator.RunConfig(save_summary_steps=2,
log_step_count_steps=10)
)
After training the model, I do the predictions as follows:
test_input_fn = tf.estimator.inputs.numpy_input_fn(
x= test_data[0:2,:,:,:],
y= None,
batch_size=1,
num_epochs=1,
shuffle=False)
predicted = DnCNN.predict(input_fn=test_input_fn)
list(predicted) # this is where the error occurs
The traceback says that tf.losses.mean_squared_error is causing this.
Traceback (most recent call last):
File "<input>", line 16, in <module>
File "...\venv2\lib\site-packages\tensorflow\python\estimator\estimator.py", line 551, in predict
features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
File "...\venv2\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1169, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "<input>", line 95, in DnCNN_model_fn
File "...\venv2\lib\site-packages\tensorflow\python\ops\losses\losses_impl.py", line 663, in mean_squared_error
raise ValueError("labels must not be None.")
ValueError: labels must not be None.
From estimator.predict raises "ValueError: None values not supported":
"In your model_fn, you define the loss in every mode (train / eval / predict). This means that even in predict mode, the labels will be used and need to be provided.
When you are in predict mode, you actually just need to return the predictions so you can return early from the function:"
def model_fn(features, labels, mode):
#...
y = ...
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=y)
#...
I am not entirely sure what the exact error was, but I managed to get my model predicting.
What I changed (apart from adding the batch norm UPDATE_OPS, which did not solve my issue) was short-circuiting (i.e. returning early and separately) the tf.estimator.EstimatorSpec in the case of tf.estimator.ModeKeys.PREDICT:
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=conv_last + input_layer
)
Apparently there is something wrong with the documentation statement (or I did not understand it correctly) found at tf.estimator.EstimatorSpec:
model_fn can populate all arguments independent of mode. In this case, some arguments will be ignored by an Estimator. E.g. train_op will be ignored in eval and infer modes.
BTW: when the mode is PREDICT, labels are automatically replaced by None at some point in any case.
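Putting it together, a hedged sketch of a full model_fn with the early return (conv_last and input_layer stand in for the question's actual layers; the rest mirrors the loss, train_op and metric from the question):
def DnCNN_model_fn(features, labels, mode):
    # ... convolutional layers producing conv_last from the input ...
    predictions = conv_last + input_layer
    # In PREDICT mode labels is None, so return before any loss is built.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    loss = tf.losses.mean_squared_error(labels=labels, predictions=predictions)
    train_op = tf.train.AdamOptimizer(learning_rate=0.001, epsilon=1e-08).minimize(
        loss=loss, global_step=tf.train.get_global_step())
    eval_metric_ops = {
        "accuracy": tf.metrics.mean_absolute_error(labels=labels, predictions=predictions)}
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, loss=loss,
                                      train_op=train_op, eval_metric_ops=eval_metric_ops)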

Applying gradients with feed_dict for gradients

I want to do some non-TensorFlow processing on the computed gradients before applying them to the variables.
My plan was to run the gradient ops that I get from the compute_gradients function, do my processing (in Python, without TensorFlow), and then run the apply operation I get from the apply_gradients function, feeding the processed gradients via the feed_dict. Unfortunately, this doesn't work in my scenario.
I managed to narrow it down to some issue with tf.nn.embedding_lookup (the same happens with tf.gather), and the error can be reproduced as follows (using tf 1.4):
import tensorflow as tf
x = tf.placeholder(dtype=tf.float32, shape=[])
z = tf.placeholder(dtype=tf.int32, shape=[])
emb_mat = tf.get_variable('w', [100, 5], initializer=tf.truncated_normal_initializer(stddev=0.1))
emb = tf.nn.embedding_lookup(emb_mat, z)
loss = x - tf.reduce_sum(emb) # Just some silly loss
opt = tf.train.GradientDescentOptimizer(0.1)
grads_and_vars = opt.compute_gradients(loss, tf.trainable_variables())
train_op = opt.apply_gradients(grads_and_vars)
grads = [g for g,v in grads_and_vars]
tsess = tf.Session()
tsess.run(tf.global_variables_initializer())
gradsres = tsess.run(grads, {x: 1.0, z: 1})
tsess.run(train_op, {g:r for g,r in zip(grads, gradsres)})
which results in the error
Traceback (most recent call last):
File "/home/cruvadom/.p2/pool/plugins/org.python.pydev_6.0.0.201709191431/pysrc/_pydevd_bundle/pydevd_exec.py", line 3, in Exec
exec exp in global_vars, local_vars
File "<console>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 889, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1098, in _run
raise ValueError('Tensor %s may not be fed.' % subfeed_t)
ValueError: Tensor Tensor("gradients/Gather_1_grad/ToInt32:0", shape=(2,), dtype=int32, device=/device:GPU:0) may not be fed.
It seems there is some additional tensor I need to feed to the graph for the computation. What is the right way to do what I want to do?
Thanks!
If you run the training operation, it will automatically calculate the gradients. You can retrieve the gradients from the session:
import numpy as np

tsess = tf.Session()
tsess.run(tf.global_variables_initializer())
# Running train_op computes and applies the gradients; the numeric gradient
# values and the loss can be fetched in the same call.
_, grad_values, loss_value = tsess.run([train_op, grads, loss], {x: 1.0, z: 1})
assert not np.isnan(loss_value), 'Something wrong! loss is nan...'
# Build summaries of the gradients from the symbolic (gradient, variable)
# pairs returned by compute_gradients.
for g, v in grads_and_vars:
    if g is not None:
        grad_hist_summary = tf.summary.histogram("{}/grad_histogram".format(v.name), g)
        sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

Tensorflow does NOT utilize the memory from two GPUs in Windows 10

Tensorflow Version 1.3.0
OS: Windows 10
GPUs: 2 x Nvidia Quadro M4000, with 8 GB of GPU memory each
GPU modes: one in WDDM, one in TCC
I tested the official code at https://github.com/tensorflow/models/blob/master/official/resnet/imagenet_main.py
I just added the GPU constraint in the main function as:
def main(unused_argv):
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
# For this line, setting visible_device_list to "0" only or to "0, 1" supports only the same batch_size
config = tf.ConfigProto(gpu_options=tf.GPUOptions(visible_device_list='0, 1'))
resnet_classifier = tf.estimator.Estimator(
model_fn=imagenet_model_fn, model_dir=FLAGS.model_dir,
config=tf.contrib.learn.RunConfig(session_config=config))
for cycle in range(FLAGS.train_steps // FLAGS.steps_per_eval):
tensors_to_log = {
'learning_rate': 'learning_rate',
'cross_entropy': 'cross_entropy',
'train_accuracy': 'train_accuracy'
}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=100)
print('Starting a training cycle.')
resnet_classifier.train(
input_fn=lambda: input_fn(tf.estimator.ModeKeys.TRAIN),
steps=FLAGS.first_cycle_steps or FLAGS.steps_per_eval,
hooks=[logging_hook])
FLAGS.first_cycle_steps = None
print('Starting to evaluate.')
eval_results = resnet_classifier.evaluate(
input_fn=lambda: input_fn(tf.estimator.ModeKeys.EVAL))
print(eval_results)
During training, if I set the visible device list to "0, 1" or to "0" only, both settings run successfully with batch_size=48, but BOTH fail with batch_size=49! This indicates that the second GPU's memory is not utilized, as the batch size cannot be any larger when using two GPUs. I have used nvidia-smi to confirm that one or two GPUs, respectively, are used in the above experiments.
My questions are:
Is there any way that I can use bigger batch_size when using two GPUs?
If the answer for Q1 is No in Windows, is there any way to do it in Linux? I am not familiar with Linux. In Linux, can I set all GPUs to TCC mode? Will the batch size be bigger when two GPUs are both in TCC mode?
Thank you.
-------------Update------------
I have tried to distribute the data batch across the two GPUs, and now there is a NaN loss error. What could possibly cause this? It ran well before (using one GPU only). But now, even if I set _DEVICE_LIST to one GPU only, it still produces the NaN loss error.
My modified codes are:
def imagenet_model_fn(features, labels, mode):
tf.summary.image('images', features, max_outputs=6)
with tf.device('/cpu:0'):
split_batch = tf.split(features, len(_DEVICE_LIST))
split_labels = tf.split(labels, len(_DEVICE_LIST))
all_predictions = {
'classes': [],
'probabilities': []
}
all_cross_entropy = []
all_reg_loss = []
with tf.variable_scope(tf.get_variable_scope()):
for dev_idx, (device, device_features, device_labels) in enumerate(zip(
_DEVICE_LIST, split_batch, split_labels)):
with tf.device(device):
with tf.name_scope('device_%d' % dev_idx):
logits = network(inputs=device_features,
is_training=(mode == tf.estimator.ModeKeys.TRAIN))
tf.get_variable_scope().reuse_variables()
all_predictions['classes'].append(tf.argmax(logits, axis=1))
all_predictions['probabilities'].append(tf.nn.softmax(logits))
if mode == tf.estimator.ModeKeys.TRAIN:
# Calculate loss, which includes softmax cross entropy and L2 regularization.
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=device_labels)
reg_loss = FLAGS.weight_decay * tf.add_n(
[tf.nn.l2_loss(v) for v in tf.trainable_variables()])
all_cross_entropy.append(cross_entropy)
all_reg_loss.append(reg_loss)
all_predictions['classes'] = tf.reshape(all_predictions['classes'], [-1])
all_predictions['probabilities'] = tf.reshape(
all_predictions['probabilities'], [-1])
total_cross_entropy = tf.add_n(all_cross_entropy)
total_reg_loss = tf.add_n(all_reg_loss)
total_loss = total_cross_entropy + total_reg_loss
tf.identity(total_cross_entropy, name='cross_entropy')
tf.summary.scalar('cross_entropy', total_cross_entropy)
tf.summary.scalar('reg_loss', total_reg_loss)
tf.summary.scalar('total_loss', total_loss)
if mode == tf.estimator.ModeKeys.TRAIN:
global_step = tf.train.get_or_create_global_step()
boundaries = [
int(batches_per_epoch * epoch) for epoch in [30, 60, 120, 150]]
values = [
_INITIAL_LEARNING_RATE * decay for decay in [1, 0.1, 0.01, 1e-3, 1e-4]]
learning_rate = tf.train.piecewise_constant(
tf.cast(global_step, tf.int32), boundaries, values)
tf.identity(learning_rate, name='learning_rate')
tf.summary.scalar('learning_rate', learning_rate)
optimizer = tf.train.MomentumOptimizer(
learning_rate=learning_rate,
momentum=_MOMENTUM)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(total_loss, global_step)
else:
train_op = None
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=all_predictions,
loss=total_loss,
train_op=train_op)
The error message is:
INFO:tensorflow:Saving checkpoints for 1 into F:\projects\DeepLearning\TensorFlow\Models\ImageNet\resnet_101_imagenet_augmented\temp\model.ckpt.
INFO:tensorflow:learning_rate = 0.003125, cross_entropy = 14.394
INFO:tensorflow:loss = 30.0782, step = 1
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "imagenet_main.py", line 321, in <module>
tf.app.run()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\platform\app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "imagenet_main.py", line 310, in main
hooks=[logging_hook])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\estimator\estimator.py", line 686, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 518, in run
run_metadata=run_metadata)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 862, in run
run_metadata=run_metadata)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 818, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\monitored_session.py", line 980, in run
run_metadata=run_metadata))
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\training\basic_session_run_hooks.py", line 551, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.

why dataset.output_shapes returns Dimension(None) after batching

I'm using the Dataset API for input pipelines in TensorFlow (version: r1.2). I built my dataset and batched it with a batch size of 128. The dataset is fed into an RNN.
Unfortunately, dataset.output_shapes returns Dimension(None) in the first dimension, so the RNN raises an error:
Traceback (most recent call last):
File "untitled1.py", line 188, in <module>
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
File "/home/harold/anaconda2/envs/tensorflow_py2.7/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "untitled1.py", line 121, in main
run_training()
File "untitled1.py", line 57, in run_training
is_training=True)
File "/home/harold/huawei/ConvLSTM/ConvLSTM.py", line 216, in inference
initial_state=initial_state)
File "/home/harold/anaconda2/envs/tensorflow_py2.7/lib/python2.7/site-packages/tensorflow/python/ops/rnn.py", line 566, in dynamic_rnn
dtype=dtype)
File "/home/harold/anaconda2/envs/tensorflow_py2.7/lib/python2.7/site-packages/tensorflow/python/ops/rnn.py", line 636, in _dynamic_rnn_loop
"Input size (depth of inputs) must be accessible via shape inference,"
ValueError: Input size (depth of inputs) must be accessible via shape inference, but saw value None.
I think this error is caused by the shape of the input: the first dimension should be the batch size, not None.
Here is the code:
origin_dataset = Dataset.BetweenS_Dataset(FLAGS.data_path)
train_dataset = origin_dataset.train_dataset
test_dataset = origin_dataset.test_dataset
shuffle_train_dataset = train_dataset.shuffle(buffer_size=10000)
shuffle_batch_train_dataset = shuffle_train_dataset.batch(128)
batch_test_dataset = test_dataset.batch(FLAGS.batch_size)
iterator = tf.contrib.data.Iterator.from_structure(
shuffle_batch_train_dataset.output_types,
shuffle_batch_train_dataset.output_shapes)
(images, labels) = iterator.get_next()
training_init_op = iterator.make_initializer(shuffle_batch_train_dataset)
test_init_op = iterator.make_initializer(batch_test_dataset)
print(shuffle_batch_train_dataset.output_shapes)
I printed output_shapes and it gives:
(TensorShape([Dimension(None), Dimension(36), Dimension(100)]), TensorShape([Dimension(None)]))
I suppose that it should be 128, because I have batched the dataset:
(TensorShape([Dimension(128), Dimension(36), Dimension(100)]), TensorShape([Dimension(128)]))
This feature has since been added via the drop_remainder parameter, used like the following:
batch_test_dataset = test_dataset.batch(FLAGS.batch_size, drop_remainder=True)
From the docs:
drop_remainder: (Optional.) A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements; the default behavior is not to drop the smaller batch.
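With that flag set, the static first dimension becomes the batch size instead of None, which is what the RNN's shape inference needs. A quick check (note that drop_remainder only exists in releases newer than the r1.2 used in the question):
import tensorflow as tf

dataset = tf.data.Dataset.range(14).batch(5, drop_remainder=True)
print(dataset.output_shapes)   # (5,) -- instead of (?,) without the flag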
They hardcoded the batch size in the implementation, so it will always return None (tf 1.3).
def _padded_shape_to_batch_shape(s):
return tensor_shape.vector(None).concatenate(
tensor_util.constant_value_as_shape(s))
In this way, they can batch all elements (e.g. dataset_size=14, batch_size=5, last_batch_size=4).
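On those older versions a common workaround, assuming every batch is guaranteed to contain exactly 128 elements (for instance by filtering out the smaller remainder, as in the next answer), is to assert the static shape on the iterator output yourself:
(images, labels) = iterator.get_next()
# Tell shape inference what the Dataset API does not know statically.
images.set_shape([128, 36, 100])
labels.set_shape([128])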
You can use dataset.filter and dataset.map to fix this issue
import tensorflow as tf

d = tf.contrib.data.Dataset.from_tensor_slices([[5] for x in range(14)])
batch_size = 5
d = d.batch(batch_size)
# Drop the final, smaller batch so every remaining batch has exactly batch_size elements.
d = d.filter(lambda e: tf.equal(tf.shape(e)[0], batch_size))
def batch_reshape(e):
    # Re-assert the now-known static batch dimension.
    return tf.reshape(e, [batch_size] + [s if s is not None else -1 for s in e.shape[1:].as_list()])
d = d.map(batch_reshape)
r = d.make_one_shot_iterator().get_next()
print('dataset_output_shape = %s' % r.shape)
with tf.Session() as sess:
    while True:
        print(sess.run(r))
Output
dataset_output_shape = (5, 1)
[[5][5][5][5][5]]
[[5][5][5][5][5]]
OutOfRangeError

tensorflow : restore from checkpoint for continue training

In this case, I want to continue training my model from a checkpoint. I use the cifar-10 example and made a small change in cifar-10_train.py, as shown below; the two are almost the same, except that I want to restore from a checkpoint.
I replaced cifar-10 with md.
"""
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os.path
import time
import numpy
import tensorflow.python.platform
from tensorflow.python.platform import gfile
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import md
"""
"""
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', '/root/test/INT/tbc',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 60000, # 55000 steps per epoch
"""Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
tf.app.flags.DEFINE_string('pretrained_model_checkpoint_path', '/root/test/INT/',
"""If specified, restore this pretrained model """
"""before beginning any training.""")
def error_rate(predictions, labels):
"""Return the error rate based on dense predictions and 1-hot labels."""
return 100.0 - (
100.0 *
numpy.sum(numpy.argmax(predictions, 0) == numpy.argmax(labels, 0)) /
predictions.shape[0])
def train():
"""Train MD65500 for a number of steps."""
with tf.Graph().as_default():
# global_step = tf.Variable(0, trainable=False)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0), trainable=False)
# Get images and labels for CIFAR-10.
images, labels = md.distorted_inputs()
# Build a Graph that computes the logits predictions from the
# inference model.
logits = md.inference(images)
# Calculate loss.
loss = md.loss(logits, labels)
# Build a Graph that trains the model with one batch of examples and
# updates the model parameters.
train_op = md.train(loss, global_step)
# Predictions for the minibatch. there is no validation set or test set.
# train_prediction = tf.nn.softmax(logits)
train_prediction = logits
# Create a saver.
saver = tf.train.Saver(tf.all_variables())
# Build the summary operation based on the TF collection of Summaries.
summary_op = tf.merge_all_summaries()
# Build an initialization operation to run below.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
# sess = tf.Session(config=tf.ConfigProto(
# log_device_placement=FLAGS.log_device_placement))
# sess.run(init)
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
# sess.run(init)
if FLAGS.pretrained_model_checkpoint_path:
assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path)
# variables_to_restore = tf.get_collection(
# slim.variables.VARIABLES_TO_RESTORE)
variable_averages = tf.train.ExponentialMovingAverage(
md.MOVING_AVERAGE_DECAY)
variables_to_restore = {}
for v in tf.all_variables():
if v in tf.trainable_variables():
restore_name = variable_averages.average_name(v)
else:
restore_name = v.op.name
variables_to_restore[restore_name] = v
ckpt = tf.train.get_checkpoint_state(FLAGS.pretrained_model_checkpoint_path)
if ckpt and ckpt.model_checkpoint_path:
# global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
restorer = tf.train.Saver(variables_to_restore)
restorer.restore(sess, ckpt.model_checkpoint_path)
print('%s: Pre-trained model restored from %s' %
(datetime.now(), ckpt.model_checkpoint_path))
# print("variables_to_restore")
# print(variables_to_restore)
else:
sess.run(init)
# Start the queue runners.
tf.train.start_queue_runners(sess=sess)
summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
graph_def=sess.graph) #####graph_def=sess.graph_def)
# tf.add_to_collection('train_op', train_op)
for step in xrange(FLAGS.max_steps):
start_time = time.time()
_, loss_value, predictions = sess.run([train_op, loss, train_prediction])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 100 == 0:
num_examples_per_step = FLAGS.batch_size
examples_per_sec = num_examples_per_step / duration
sec_per_batch = float(duration)
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch)')
print (format_str % (datetime.now(), step, loss_value,
examples_per_sec, sec_per_batch))
# print('Minibatch error: %.5f%%' % error_rate(predictions, labels))
if step % 100 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
# Save the model checkpoint periodically.
if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step=step)
def main(argv=None): # pylint: disable=unused-argument
# md.maybe_download()
# if gfile.Exists(FLAGS.train_dir):
# gfile.DeleteRecursively(FLAGS.train_dir)
# gfile.MakeDirs(FLAGS.train_dir)
train()
if __name__ == '__main__':
tf.app.run()
When I run the code, I get errors like this:
[root@bogon md try]# pythonnew mdtbc_3.py
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcudnn.so locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally
Filling queue with 4000 CIFAR images before starting to train. This will take a few minutes.
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:900] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
name: GeForce GTX 980 Ti
major: 5 minor: 2 memoryClockRate (GHz) 1.228
pciBusID 0000:01:00.0
Total memory: 6.00GiB
Free memory: 5.78GiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 980 Ti, pci bus id: 0000:01:00.0)
2016-08-30 17:12:48.883303: Pre-trained model restored from /root/test/INT/model.ckpt-59999
WARNING:tensorflow:When passing a `Graph` object, please use the `graph` named argument instead of `graph_def`.
Traceback (most recent call last):
File "mdtbc_3.py", line 195, in <module>
tf.app.run()
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
sys.exit(main(sys.argv))
File "mdtbc_3.py", line 191, in main
train()
File "mdtbc_3.py", line 160, in train
_, loss_value, predictions = sess.run([train_op, loss, train_prediction])
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 564, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 637, in _do_run
target_list, options, run_metadata)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 659, in _do_call
e.code)
tensorflow.python.framework.errors.FailedPreconditionError: Attempting to use uninitialized value conv2/weights
[[Node: conv2/weights/read = Identity[T=DT_FLOAT, _class=["loc:@conv2/weights"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2/weights)]]
Caused by op u'conv2/weights/read', defined at:
File "mdtbc_3.py", line 195, in <module>
tf.app.run()
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
sys.exit(main(sys.argv))
File "mdtbc_3.py", line 191, in main
train()
File "mdtbc_3.py", line 77, in train
logits = md.inference(images)
File "/root/test/md try/md.py", line 272, in inference
stddev=0.1, wd=0.0)
File "/root/test/md try/md.py", line 114, in _variable_with_weight_decay
tf.truncated_normal_initializer(stddev=stddev))
File "/root/test/md try/md.py", line 93, in _variable_on_cpu
var = tf.get_variable(name, shape, initializer=initializer)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 339, in get_variable
collections=collections)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 262, in get_variable
collections=collections, caching_device=caching_device)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 158, in get_variable
dtype=variable_dtype)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 209, in __init__
dtype=dtype)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 318, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 609, in identity
return _op_def_lib.apply_op("Identity", input=input, name=name)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/pythonnew/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()
When I uncomment line 107, "sess.run(init)", it runs perfectly, but with a freshly initialised model, just a new one from scratch. I want to restore the variables from the checkpoint and continue my training.
Without having the rest of your code handy, I'd say that the following part is problematic:
for v in tf.all_variables():
if v in tf.trainable_variables():
restore_name = variable_averages.average_name(v)
else:
restore_name = v.op.name
variables_to_restore[restore_name] = v
You specify a list of variables you want to restore here, but you exclude some names (i.e. the plain v.op.name for the variables in the trainable set, which are restored under their moving-average names instead). That can change the name of the variable in the net that throws the error (again, without the rest of the code I cannot really say), so that one (or more) variables are not restored properly. Two approaches (which are not very sophisticated) will help you here:
1. If you do not store all variables, do an initialization first, and then restore the variables you have actually stored (see the sketch below). This makes sure that variables you do not really care about get initialized nonetheless.
2. TF is very efficient when it comes to storing nets. If in doubt, store all variables.
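A minimal sketch of the first approach, reusing the names from the question: run the initializer unconditionally, then let the restorer overwrite whatever actually exists in the checkpoint.
sess.run(init)  # initialize everything first, including variables
                # that are not present in the checkpoint
ckpt = tf.train.get_checkpoint_state(FLAGS.pretrained_model_checkpoint_path)
if ckpt and ckpt.model_checkpoint_path:
    restorer = tf.train.Saver(variables_to_restore)
    restorer.restore(sess, ckpt.model_checkpoint_path)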