TensorFlow: InternalError: Blas SGEMM launch failed - tensorflow

When I run sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) I get InternalError: Blas SGEMM launch failed. Here is the full error and stack trace:
InternalErrorTraceback (most recent call last)
<ipython-input-9-a3261a02bdce> in <module>()
1 batch_xs, batch_ys = mnist.train.next_batch(100)
----> 2 sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
338 try:
339 result = self._run(None, fetches, feed_dict, options_ptr,
--> 340 run_metadata_ptr)
341 if run_metadata:
342 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
562 try:
563 results = self._do_run(handle, target_list, unique_fetches,
--> 564 feed_dict_string, options, run_metadata)
565 finally:
566 # The movers are no longer used. Delete them.
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
635 if handle is None:
636 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
--> 637 target_list, options, run_metadata)
638 else:
639 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
657 # pylint: disable=protected-access
658 raise errors._make_specific_exception(node_def, op, error_message,
--> 659 e.code)
660 # pylint: enable=protected-access
661
InternalError: Blas SGEMM launch failed : a.shape=(100, 784), b.shape=(784, 10), m=100, n=10, k=784
[[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](_recv_Placeholder_0/_4, Variable/read)]]
Caused by op u'MatMul', defined at:
File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 596, in launch_instance
app.start()
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 442, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 162, in start
super(ZMQIOLoop, self).start()
File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 883, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 391, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 199, in do_execute
shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2825, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-4-d7414c4b6213>", line 4, in <module>
y = tf.nn.softmax(tf.matmul(x, W) + b)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1036, in matmul
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 911, in _mat_mul
transpose_b=transpose_b, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()
Stack: EC2 g2.8xlarge machine, Ubuntu 14.04

Old question, but may help others.
Try to close interactive sessions active in other processes (if IPython Notebook - just restart kernels). This helped me!
Additionally, I use this code to close local sessions in this kernel during experiments:
# Close a session left over from an earlier run of this cell, if there is one.
if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()

I encountered this problem and solved it by setting allow_soft_placement=True and gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3), which explicitly defines the fraction of GPU memory to be used. I guess this helped to avoid two tensorflow processes competing for the GPU memory.
# Limit this process to 30% of the GPU memory; the author's guess is that this
# keeps two TensorFlow processes from competing for the same memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
sess = tf.Session(config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=True,
    # The options must be passed in here, otherwise the limit is never applied.
    gpu_options=gpu_options))

I got this error when running Tensorflow Distributed. Did you check if any of the workers were reporting CUDA_OUT_OF_MEMORY errors? If this is the case it may have to do with where you place your weight and bias variables. E.g.
# Create the weight and bias variables under an explicit device scope
# (CPU of task 0 of the "paramserver" job).
with tf.device("/job:paramserver/task:0/cpu:0"):
    W = weight_variable([input_units, num_hidden_units])
    b = bias_variable([num_hidden_units])

My environment is Python 3.5, Tensorflow 0.12 and Windows 10 (no Docker). I am training neural networks in both CPU and GPU. I came across the same error InternalError: Blas SGEMM launch failed whenever training in the GPU.
I could not find the reason why this error happens but I managed to run my code in the GPU by avoiding the tensorflow function tensorflow.contrib.slim.one_hot_encoding(). Instead, I do the one-hot-encoding operation in numpy (input and output variables).
The following code reproduces the error and the fix. It is a minimal setup to learn the y = x ** 2 function using gradient descent.
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
def test_one_hot_encoding_using_tf():
    """Fit y = x ** 2 with a single weight matrix, one-hot encoding inside the graph.

    The integer input and target are encoded with ``slim.one_hot_encoding``.
    This function raises the "InternalError: Blas SGEMM launch failed" when
    run in the GPU.
    """
    # Initialize
    tf.reset_default_graph()
    input_size = 10
    output_size = 100
    input_holder = tf.placeholder(shape=[1], dtype=tf.int32, name='input')
    output_holder = tf.placeholder(shape=[1], dtype=tf.int32, name='output')

    # Define network
    input_oh = slim.one_hot_encoding(input_holder, input_size)
    output_oh = slim.one_hot_encoding(output_holder, output_size)
    W1 = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
    output_v = tf.matmul(input_oh, W1)
    output_v = tf.reshape(output_v, [-1])

    # Define updates
    loss = tf.reduce_sum(tf.square(output_oh - output_v))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    update_model = trainer.minimize(loss)

    # Optimize
    init = tf.initialize_all_variables()
    steps = 1000

    # Force CPU/GPU
    config = tf.ConfigProto(
        # device_count={'GPU': 0}  # uncomment this line to force CPU
    )

    # Launch the tensorflow graph
    with tf.Session(config=config) as sess:
        sess.run(init)
        for step_i in range(steps):
            # Get sample
            x = np.random.randint(0, input_size)
            y = np.power(x, 2).astype('int32')
            # Update
            _, loss_value = sess.run(
                [update_model, loss],
                feed_dict={input_holder: [x], output_holder: [y]})

    # Check model
    print('Final loss: %f' % loss_value)
def test_one_hot_encoding_no_tf():
    """Fit y = x ** 2 with a single weight matrix, one-hot encoding done in numpy.

    The placeholders receive already-encoded rows, so no one-hot op is part
    of the graph. This function does not raise the "InternalError: Blas SGEMM
    launch failed" when run in the GPU.
    """
    def oh_encoding(label, num_classes):
        # Row `label` of the identity matrix, kept 2-D with shape (1, num_classes).
        return np.identity(num_classes)[label:label + 1].astype('int32')

    # Initialize
    tf.reset_default_graph()
    input_size = 10
    output_size = 100
    input_holder = tf.placeholder(shape=[1, input_size], dtype=tf.float32, name='input')
    output_holder = tf.placeholder(shape=[1, output_size], dtype=tf.float32, name='output')

    # Define network
    W1 = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
    output_v = tf.matmul(input_holder, W1)
    output_v = tf.reshape(output_v, [-1])

    # Define updates
    loss = tf.reduce_sum(tf.square(output_holder - output_v))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    update_model = trainer.minimize(loss)

    # Optimize
    init = tf.initialize_all_variables()
    steps = 1000

    # Force CPU/GPU
    config = tf.ConfigProto(
        # device_count={'GPU': 0}  # uncomment this line to force CPU
    )

    # Launch the tensorflow graph
    with tf.Session(config=config) as sess:
        sess.run(init)
        for step_i in range(steps):
            # Get sample
            x = np.random.randint(0, input_size)
            y = np.power(x, 2).astype('int32')
            # One hot encoding (sizes match the placeholder widths)
            x = oh_encoding(x, input_size)
            y = oh_encoding(y, output_size)
            # Update
            _, loss_value = sess.run(
                [update_model, loss],
                feed_dict={input_holder: x, output_holder: y})

    # Check model
    print('Final loss: %f' % loss_value)

Maybe the GPU was not released properly. If you are using Linux, try "ps -ef | grep python" to see which Python jobs may still be using the GPU, then kill them.

In my case, I had 2 python consoles open, both using keras/tensorflow.
As I closed the old console (forgotten from previous day),
everything started to work correctly.
So it is good to check, if you do not have multiple consoles / processes occupying GPU.

I closed all other Jupyter sessions that were running and this solved the problem. I think it was a GPU memory issue.

In my case,
First, I run
conda clean --all
to clean up tarballs and unused packages.
Then, I restart IDE (Pycharm in this case) and it works well. Environment: anaconda python 3.6, windows 10 64bit. I install tensorflow-gpu by a command provided on the anaconda website.

For me, I got this problem when I tried to run multiple tensorflow processes (e.g. 2) and both of them require to access GPU resources.
A simple solution is to make sure there has to be only one tensorflow process running at a single time.
For more details, you can see here.
To be clear, tensorflow will try (by default) to consume all available
GPUs. It cannot be run with other programs also active. Closing. Feel
free to reopen if this is actually another problem.

2.0 Compatible Answer: Providing 2.0 Code for erko's answer for the benefit of the Community.
session = tf.compat.v1.Session()
# Close the session if one exists in this scope.
if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()

In my case, the network filesystem under which libcublas.so was located simply died. The node was rebooted and everything was fine. Just to add another point to the dataset.

I encountered this error when running Keras CuDNN tests in parallel with pytest-xdist. The solution was to run them serially.

For me, I got this error when using Keras with Tensorflow as the backend. It was because the deep learning environment in Anaconda was not activated properly; as a result, Tensorflow didn't kick in properly either. I noticed this because the last time I activated my deep learning environment (which is called dl), the prompt changed in my Anaconda Prompt to this:
(dl) C:\Users\georg\Anaconda3\envs\dl\etc\conda\activate.d>set "KERAS_BACKEND=tensorflow"
While it only had the dl before then. Therefore, what I did to get rid of the above error was to close my jupyter notebook and Anaconda prompt, then relaunch, for several times.

I encountered this error after changing OS to Windows 10 recently, and I never encountered this before when using windows 7.
The error occurs if I load my GPU Tensorflow model when an another GPU program is running; it's my JCuda model loaded as socket server, which is not large. If I close my other GPU program(s), this Tensorflow model can be loaded very successfully.
This JCuda program is not large at all, just around 70M, and in comparison this Tensorflow model is more than 500M and much larger. But I am using a 1080 Ti, which has plenty of memory. So it is probably not an out-of-memory problem, and it may be some tricky internal issue of Tensorflow regarding the OS or Cuda. (PS: I am using Cuda version 8.0.44 and haven't downloaded a newer version.)

Restarting my Jupyter processes wasn't enough; I had to reboot my computer.

In my case, it is enough to open the Jupyter Notebooks in separate servers.
This error only occurs for me if I try using more than one tensorflow/keras model in the same server. It doesn't matter if I open one notebook, execute it, then close it and try opening another. If they are being loaded in the same Jupyter server the error always happens.

Related

slim.dataset_data_provider.DatasetDataProvider with num_epochs=1 throws error

I am using the relatively new tf.slim Dataset, DatasetDataProvider pattern. The following code shows the key fragments:
with tf.Graph().as_default():
    # get the dataset split
    dataset = util.get_split(train_or_eval,
                             args.tfrecord_folder,
                             0,
                             args.eval_set_size,
                             crop_size,
                             file_pattern=file_pattern)
    features, labels = util.load_batch(dataset,
                                       batch_size=args.eval_batch_size,
                                       num_readers=10,
                                       num_epochs=1,
                                       is_training=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        # start the queue runner
        with slim.queues.QueueRunners(sess):
            pass  # ...run some ops...
Here's the definition of load_batch:
def load_batch(dataset, batch_size=64, is_training=False,
               num_epochs=None, common_queue_capacity=256,
               common_queue_min=32, num_readers=None):
    """Build batched image and label tensors from a slim dataset.

    Args:
        dataset: dataset object handed to ``DatasetDataProvider``.
        batch_size: number of examples per batch.
        is_training: accepted for interface compatibility; not used here.
        num_epochs: forwarded to the data provider.
        common_queue_capacity: forwarded to the data provider.
        common_queue_min: forwarded to the data provider.
        num_readers: forwarded to the data provider.

    Returns:
        A tuple ``(images, labels)`` of batched tensors.
    """
    shuffle = True
    # create the data provider
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=num_readers,
        shuffle=shuffle,
        num_epochs=num_epochs,
        common_queue_capacity=common_queue_capacity,
        common_queue_min=common_queue_min,
        seed=5)
    # get the tensors from the data provider
    images, labels = data_provider.get(['image_raw', 'label'])
    # batch up some training data; pass the tensors fetched above
    # (the names `image_raw` and `label` were never defined in this function)
    images, labels = tf.train.batch([images, labels],
                                    batch_size=batch_size,
                                    num_threads=5,
                                    allow_smaller_final_batch=True,
                                    capacity=2 * batch_size)
    return images, labels
This works fine when num_epochs=None (which according to the comments in the source means that a file of tfrecords can be read an infinite number of times), but fails when num_epochs=1. Here's the error message:
Out of range: FIFOQueue '_9_batch/fifo_queue' is closed and has insufficient elements (requested 32, current size 0)
Obviously, I need to be able to run an eval step without repeating the examples to get good accuracy and confusion matrix numbers. Any thoughts would be appreciated...
Per the request in the comments I am adding the stack trace. I am running this job in Google Cloud ML so its easiest to show it this way. The logs have a series of paired messages as follows:
Out of range: FIFOQueue '_6_batch/fifo_queue' is closed and has
insufficient elements (requested 32, current size 0)[[Node: batch =
QueueDequeueUpToV2[component_types=[DT_UINT8, DT_INT64, DT_STRING,
DT_STRING], timeout_ms=-1,
_device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
[[Node: batch =
QueueDequeueUpToV2[component_types=[DT_UINT8, DT_INT64, DT_STRING,
DT_STRING], timeout_ms=-1,
_device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
Final Stack Trace is
"The replica master 0 exited with a non-zero status of 1. Termination
reason: Error.Traceback (most recent call last): [...] File
"/root/.local/lib/python2.7/site-packages/trainer/task.py", line 509,
in
main() File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 505,
in main
run() File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 113,
in run
run_eval(args) File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 285,
in run_eval
is_training=True) File "/root/.local/lib/python2.7/site-packages/trainer/util.py", line 210,
in load_batch
capacity=3 * batch_size) File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/input.py",
line 872, in batch
name=name) File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/input.py",
line 665, in _batch
dequeued = queue.dequeue_up_to(batch_size, name=name) File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/data_flow_ops.py",
line 499, in dequeue_up_to
self._queue_ref, n=n, component_types=self._dtypes, name=name) File
"/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_data_flow_ops.py",
line 1402, in _queue_dequeue_up_to_v2
timeout_ms=timeout_ms, name=name) File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py",
line 763, in apply_op
op_def=op_def) File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py",
line 2327, in create_op
original_op=self._default_original_op, op_def=op_def) File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py",
line 1226, in init
self._traceback = _extract_stack()
OutOfRangeError (see above for traceback): FIFOQueue
'_6_batch/fifo_queue' is closed and has insufficient elements
(requested 32, current size 0) [[Node: batch =
QueueDequeueUpToV2[component_types=[DT_UINT8, DT_INT64, DT_STRING,
DT_STRING], timeout_ms=-1,
_device="/job:localhost/replica:0/task:0/cpu:0"](batch/fifo_queue, batch/n)]]
To find out more about why your job exited please check the logs:
https://console.cloud.google.com/logs/viewer?...
After extensive study and reading on GitHub, I found that many people reported that eliminating this issue was a matter of making sure that the initializers for local and global variables are run at the top of the session, using the following:
tf.group(tf.local_variables_initializer(), tf.global_variables_initializer())
However, that did not fix the issue for many (including me), and I suspect for those that it did work, there were other problems leading to an empty FIFO queue.
After much reading, it appears that this is a defect for which there is not an obvious fix. Several work arounds are proposed. I was running a full cycle of train, eval, and predict. Here is the approach which worked for me:
1) On training, I set num_epochs=None. This cycles through the data an infinite number of times and if the documentation is correct, each example is presented only once per epoch. I did spot checking to confirm this, but my dataset was too large to guarantee the docs are correct. That said, my model did not overfit. Train, test, and validation were all reasonably close in terms of accuracy.
2) On eval, I was building a 15 model ensemble and I wanted to compare the proposal selection to ground truth before submitting unlabeled data for validation. I kept an extra hold out set from a k-fold cross validation run and needed to be sure that each example in the hold out set was predicted once and only once. So to make that work, I: a) set num_epochs=1, b) eliminated all calculations from the eval graph except the prediction, c) reduced the size of the eval set to ~3000 examples, d) set shuffle_batch=False, e) set the batch size so that the queue would have a few extra examples.
With these conditions, the queue runners did not run out of examples before my graph completed and I got my test set
3) On predict, I used the same technique again as for eval except that I chose a batch size and number of train steps that was exactly equal to the number of predict records. Since there was no gradient back prop, the predicts were fast enough to finish before the queue runner could kill my job.
Problem solved. Jury rigged. But, it worked. Desperation is the mother of ingenuity or something like that!

Receiving Negative Input Dimensions with TensorFlow MonitoredTrainingSession

I'm attempting to switch from a tf.Session() to a tf.train.MonitoredTrainingSession (all on one machine, no fancy distributed computing), but I'm getting an error that I don't fully understand.
W tensorflow/core/framework/op_kernel.cc:1148] Invalid argument: Shape [16,-1,4] has negative dimensions
E tensorflow/core/common_runtime/executor.cc:644] Executor failed to create kernel. Invalid argument: Shape [16,-1,4] has negative dimensions
[[Node: define_inputs/Placeholder = Placeholder[dtype=DT_FLOAT, shape=[16,?,4], _device="/job:local/replica:0/task:0/cpu:0"]()]]
Further down, I receive a little more information about the error:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1139, in _do_call
return fn(*args)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1121, in _run_fn
status, run_metadata)
File "/usr/local/Cellar/python3/3.6.1/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 89, in __exit__
next(self.gen)
File "/usr/local/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status))
I'm using tf.contrib.seq2seq and my input and output sequences have variable lengths e.g. x_placeholder = tf.placeholder(tf.float32, [batch_size, None, 4]).
I suspect that the queues that I'm using to read data and bucket data by sequence length are somehow failing or getting interrupted by the MonitoredTrainingSession, as I don't have this problem with a vanilla Session.
Here's the code that sets up the MonitoredTrainingSession
# create a global step
global_step = tf.contrib.framework.get_or_create_global_step()
# define graph
model = import_model(global_step)
# create a one process cluster with an in-process server
server = tf.train.Server.create_local_server()
# define hooks for writing summaries and model variables to disk
hooks = construct_training_hooks(model.summary_op)
with tf.train.MonitoredTrainingSession(master=server.target,
                                       is_chief=True,
                                       hooks=hooks) as monitored_sess:
    # create coordinator to handle threading
    coord = tf.train.Coordinator()
    # start threads to enqueue input minibatches for training
    threads = tf.train.start_queue_runners(sess=monitored_sess, coord=coord)
    # train
    while not monitored_sess.should_stop():
        train_op(monitored_sess, model, x_train, y_train, y_lengths_train)
    # when done, ask the threads to stop
    coord.request_stop()
    # wait for threads to finish
    coord.join(threads)
Here is how I'm creating my training hooks:
def construct_training_hooks(summary_op):
    """Return the list of session hooks used for training.

    The list holds a stop-at-step hook (limit from ``FLAGS.training_steps``),
    a checkpoint saver and a summary saver; both savers write to
    ``FLAGS.log_dir`` with ``save_steps=10``.

    Args:
        summary_op: summary op handed to the ``SummarySaverHook``.
    """
    hooks = [tf.train.StopAtStepHook(last_step=tf.flags.FLAGS.training_steps),
             tf.train.CheckpointSaverHook(checkpoint_dir=tf.flags.FLAGS.log_dir,
                                          saver=tf.train.Saver(),
                                          save_steps=10),
             tf.train.SummarySaverHook(output_dir=tf.flags.FLAGS.log_dir,
                                       summary_op=summary_op,
                                       save_steps=10)]
    return hooks

tensorflow: ValueError: GraphDef cannot be larger than 2GB

This is the error i got
Traceback (most recent call last):
File "fully_connected_feed.py", line 387, in <module>
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
File "/home/-/.local/lib/python2.7/site-
packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "fully_connected_feed.py", line 289, in main
run_training()
File "fully_connected_feed.py", line 256, in run_training
saver.save(sess, checkpoint_file, global_step=step)
File "/home/-/.local/lib/python2.7/site-
packages/tensorflow/python/training/saver.py", line 1386, in save
self.export_meta_graph(meta_graph_filename)
File "/home/-/.local/lib/python2.7/site-
packages/tensorflow/python/training/saver.py", line 1414, in export_meta_graph
graph_def=ops.get_default_graph().as_graph_def(add_shapes=True),
File "/home/-/.local/lib/python2.7/site-
packages/tensorflow/python/framework/ops.py", line 2257, in as_graph_def
result, _ = self._as_graph_def(from_version, add_shapes)
File "/home/-/.local/lib/python2.7/site-
packages/tensorflow/python/framework/ops.py", line 2220, in _as_graph_def
raise ValueError("GraphDef cannot be larger than 2GB.")
ValueError: GraphDef cannot be larger than 2GB.
I believe it is from the result of this code
# Take the first global variable collected under the "hidden1" scope and
# build an op that scatters `updates` into it at `indices`.
weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden1")[0]
weights = tf.scatter_nd_update(weights,indices, updates)
# Same for the "hidden2" scope; note that `weights` is rebound here.
weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="hidden2")[0]
weights = tf.scatter_nd_update(weights,indices, updates)
I am not sure why my model is getting so big in size (15k steps and 240MB). Any thoughts? thanks!
It's hard to say what is happening without seeing the code, but in general TensorFlow model sizes will not increase with number of steps - they should be fixed.
If the model size is increasing with number of steps, it suggests that the computation graph is being added to on every step. For example, something like:
import tensorflow as tf
with tf.Session() as sess:
    for i in xrange(1000):
        # A new add op (and its constants) is created on every iteration.
        sess.run(tf.add(1, 2))
        # or perhaps sess.run(tf.scatter_nd_update(...)) in your case
will create 3000 nodes in the graph (one for add, one for '1' one for '2' on every iteration). Instead, you want to define your computational graph once and run repeatedly with something like:
import tensorflow as tf
# Define the op once, outside the loop.
x = tf.add(1, 2)
# or perhaps x = tf.scatter_nd_update(...) in your case
with tf.Session() as sess:
    for i in xrange(1000):
        sess.run(x)
Which will have a fixed graph of 3 nodes for all the 1000 (and any more) iterations. Hope that helps.

Tensorflow OOM after freeze graph

I'm running a seq2seq model with tf, the inference program runs well when loading parameters from checkpoint file using tf.train.Saver. But after exporting the graph with freeze_graph.py (using tf.framework.graph_util.convert_variables_to_constants()), and import with tf.import_graph_def in the inference program, it got OOM problem.
Here is a part of error log:
W tensorflow/core/common_runtime/bfc_allocator.cc:274] ****************************************************************************************************
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 4.0KiB. See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:983] Internal: Dst tensor is not initialized.
E tensorflow/core/common_runtime/executor.cc:594] Executor failed to create kernel. Internal: Dst tensor is not initialized.
[[Node: embedding_attention_seq2seq/embedding_attention_decoder/attention_decoder/AttnV_0 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [1024] values: -0.016628871 -0.2054652 -0.045054652...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Traceback (most recent call last):
File "inference.py", line 88, in console_main
result = list(inference(source_sentence))
File "inference.py", line 54, in inference
for sequence in result:
File "/data/experiment/decoder.py", line 115, in search_best_sequence
State.batch_predict(self.session, self.model, self.context, beam)
File "/data/experiment/decoder.py", line 82, in batch_predict
state_list[0].depth)
File "/data/experiment/seq2seq_model.py", line 452, in batch_feed_decoder
log_softmax, attns, state = session.run(output_fetch, input_feed)
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 966, in _run
feed_dict_string, options, run_metadata)
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1016, in _do_run
target_list, options, run_metadata)
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1036, in _do_call
raise type(e)(node_def, op, message)
InternalError: Dst tensor is not initialized.
[[Node: embedding_attention_seq2seq/embedding_attention_decoder/attention_decoder/AttnV_0 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [1024] values: -0.016628871 -0.2054652 -0.045054652...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Caused by op u'embedding_attention_seq2seq/embedding_attention_decoder/attention_decoder/AttnV_0', defined at:
File "inference.py", line 169, in <module>
tf.app.run()
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 44, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "inference.py", line 165, in main
console_main(session)
File "inference.py", line 66, in console_main
model = create_model(session, False)
File "/data/experiment/model.py", line 145, in create_model
tensor_name_pickle=tensor_name_pickle)
File "/data/experiment/seq2seq_model.py", line 106, in __init__
tf.import_graph_def(graph_def, name="")
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/framework/importer.py", line 287, in import_graph_def
op_def=op_def)
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/home/.conda/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
self._traceback = _extract_stack()
InternalError (see above for traceback): Dst tensor is not initialized.
[[Node: embedding_attention_seq2seq/embedding_attention_decoder/attention_decoder/AttnV_0 = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [1024] values: -0.016628871 -0.2054652 -0.045054652...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
I think it might be caused by a memory issue with tf.Constant. Does someone have experience with this problem?
I had the same issue but when trying to load and run the inference from a C++ application using the C API. After a lot of twiddling and testing it appeared the culprit was the frozen graph and freeze_graph.py itself. It's probably a bug of some kind. There are actually multiple issue reports on github's TF repo, but they were just closed due to lack of activity, e.g. here and here. I guess apparent bugs of model freezing aren't of any priority.
In my case the model .pb file was around 500mb and it took around 10Gb of RAM while running a session. Not only did it occupy an insane amount of RAM, it was actually orders of magnitudes slower that way.
When I switched to loading just a SavedModel directory everything went to normal. I'm not sure how to achieve that in python, but for C code I replaced a TF_GraphImportGraphDef() call with TF_LoadSessionFromSavedModel().
I used TF v1.14.0. The library is built with Bazel by me, not the stock version. I could provide some details here and there if anybody was interested. Just not sure where to start, I had many trials and errors.

Does Google Cloud ML support GPU?

I'm testing Google Cloud ML for speeding up my ML model using Tensorflow.
Unfortunately, it seems like Google Cloud ML is extremely slow. My Mainstream-Level PC is at least 10x faster than Google Cloud ML.
I doubt it uses GPU, so I did a test. I modified a sample code to force using GPU.
diff --git a/mnist/trainable/trainer/task.py b/mnist/trainable/trainer/task.py
index 9acb349..a64a11d 100644
--- a/mnist/trainable/trainer/task.py
+++ b/mnist/trainable/trainer/task.py
@@ -131,11 +131,12 @@ def run_training():
images_placeholder, labels_placeholder = placeholder_inputs(
FLAGS.batch_size)
- # Build a Graph that computes predictions from the inference model.
- logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)
+ with tf.device("/gpu:0"):
+ # Build a Graph that computes predictions from the inference model.
+ logits = mnist.inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)
- # Add to the Graph the Ops for loss calculation.
- loss = mnist.loss(logits, labels_placeholder)
+ # Add to the Graph the Ops for loss calculation.
+ loss = mnist.loss(logits, labels_placeholder)
# Add to the Graph the Ops that calculate and apply gradients.
train_op = mnist.training(loss, FLAGS.learning_rate)
This training code works at my PC (gcloud beta ml local train ...) but not in cloud. It gives errors like this:
"Traceback (most recent call last):
File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 239, in <module>
tf.app.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 43, in run
sys.exit(main(sys.argv[:1] + flags_passthrough))
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 235, in main
run_training()
File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 177, in run_training
sess.run(init)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 766, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 964, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1014, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1034, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: Cannot assign a device to node 'softmax_linear/biases': Could not satisfy explicit device specification '/device:GPU:0' because no devices matching that specification are registered in this process; available devices: /job:localhost/replica:0/task:0/cpu:0
Colocation Debug Info:
Colocation group had the following types and devices:
ApplyGradientDescent: CPU
Identity: CPU
Assign: CPU
Variable: CPU
[[Node: softmax_linear/biases = Variable[container="", dtype=DT_FLOAT, shape=[10], shared_name="", _device="/device:GPU:0"]()]]
Does Google Cloud ML support GPU?
GPUs are now in Beta and all Cloud ML customers have access.
Here are the docs for using GPUs with Cloud ML.