PyTorch Object Detection with GPU on Ubuntu 18.04 - RuntimeError: CUDA out of memory. Tried to allocate xx.xx MiB

I'm attempting to get this PyTorch person detection example:
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
running locally with a GPU, either in a Jupyter Notebook or a regular Python file. I get the error in the title either way.
I'm using Ubuntu 18.04. Here is a summary of the steps I've performed:
1) Stock Ubuntu 18.04 install on a Lenovo ThinkPad X1 Extreme Gen 2 with a GTX 1650 GPU.
2) Perform a standard CUDA 10.0 / cuDNN 7.4 install. I'd rather not restate all the steps, as this post is going to be more than long enough already. This is a standard procedure; pretty much any guide found via Googling covers the steps I followed.
3) Install torch and torchvision
pip3 install torch torchvision
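As a quick sanity check after this step (not part of the original instructions, just a minimal verification sketch using standard torch APIs), the installed build should report CUDA support and the GTX 1650:
import torch

print(torch.__version__)             # installed PyTorch version
print(torch.version.cuda)            # CUDA version the wheel was built against
print(torch.cuda.is_available())     # True if a usable GPU is visible
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # e.g. "GeForce GTX 1650"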
4) From this link on the PyTorch site:
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
I've both saved the linked notebook:
https://colab.research.google.com/github/pytorch/vision/blob/temp-tutorial/tutorials/torchvision_finetuning_instance_segmentation.ipynb
and also tried the regular Python file linked at the bottom:
https://pytorch.org/tutorials/_static/tv-training-code.py
5) Before running either the notebook or the regular Python way, I did the following (found at the top of the above linked notebook):
Install the CoCo API into Python:
cd ~
git clone https://github.com/cocodataset/cocoapi.git
cd cocoapi/PythonAPI
open Makefile in gedit, change the two instances of "python" to "python3", then:
python3 setup.py build_ext --inplace
sudo python3 setup.py install
Get the helper files that the above-linked notebook and script need to run:
cd ~
git clone https://github.com/pytorch/vision.git
cd vision
git checkout v0.5.0
From ~/vision/references/detection, copy coco_eval.py, coco_utils.py, engine.py, transforms.py, and utils.py to whichever directory the above-linked notebook or tv-training-code.py file is being run from.
6) Download the Penn Fudan Pedestrian dataset from the link on the above page:
https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip
then unzip and put in the same directory as the notebook or tv-training-code.py
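As a small hedged check (not from the original post) that the dataset landed where the script expects it, the two folders inside PennFudanPed should each contain 170 files:
import os

root = "PennFudanPed"  # assumed to sit next to the notebook / tv-training-code.py
print(len(os.listdir(os.path.join(root, "PNGImages"))))  # expected: 170 images
print(len(os.listdir(os.path.join(root, "PedMasks"))))   # expected: 170 masks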
In case the above link ever breaks or just for easier reference, here is tv-training-code.py as I have downloaded it at this time:
# Sample code from the TorchVision 0.3 Object Detection Finetuning Tutorial
# http://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
import os
import numpy as np
import torch
from PIL import Image
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from engine import train_one_epoch, evaluate
import utils
import transforms as T
class PennFudanDataset(object):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

    def __getitem__(self, idx):
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask = Image.open(mask_path)
        mask = np.array(mask)
        # instances are encoded as different colors
        obj_ids = np.unique(mask)
        # first id is the background, so remove it
        obj_ids = obj_ids[1:]
        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]
        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        return len(self.imgs)
def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    return model
def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)
def main():
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))
    # split the dataset in train and test set
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])
    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)
    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)
    # move model to the right device
    model.to(device)
    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    # let's train it for 10 epochs
    num_epochs = 10
    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)
    print("That's it!")

if __name__ == "__main__":
    main()
Here is an example run of tv-training-code.py:
$ python3 tv-training-code.py
Epoch: [0] [ 0/60] eta: 0:01:17 lr: 0.000090 loss: 4.1717 (4.1717) loss_classifier: 0.8903 (0.8903) loss_box_reg: 0.1379 (0.1379) loss_mask: 3.0632 (3.0632) loss_objectness: 0.0700 (0.0700) loss_rpn_box_reg: 0.0104 (0.0104) time: 1.2864 data: 0.1173 max mem: 1865
Traceback (most recent call last):
  File "tv-training-code.py", line 165, in <module>
    main()
  File "tv-training-code.py", line 156, in main
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
  File "/xxx/PennFudanExample/engine.py", line 46, in train_one_epoch
    losses.backward()
  File "/usr/local/lib/python3.6/dist-packages/torch/tensor.py", line 166, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py", line 99, in backward
    allow_unreachable=True) # allow_unreachable flag
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/function.py", line 77, in apply
    return self._forward_cls.backward(self, *args)
  File "/usr/local/lib/python3.6/dist-packages/torch/autograd/function.py", line 189, in wrapper
    outputs = fn(ctx, *args)
  File "/usr/local/lib/python3.6/dist-packages/torchvision/ops/roi_align.py", line 38, in backward
    output_size[0], output_size[1], bs, ch, h, w, sampling_ratio)
RuntimeError: CUDA out of memory. Tried to allocate 132.00 MiB (GPU 0; 3.81 GiB total capacity; 2.36 GiB already allocated; 132.69 MiB free; 310.59 MiB cached) (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:267)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x33 (0x7fdfb6c9b813 in /usr/local/lib/python3.6/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1ce68 (0x7fdfb6edce68 in /usr/local/lib/python3.6/dist-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1de6e (0x7fdfb6edde6e in /usr/local/lib/python3.6/dist-packages/torch/lib/libc10_cuda.so)
frame #3: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x279 (0x7fdf59472789 in /usr/local/lib/python3.6/dist-packages/torch/lib/libtorch.so)
[many more frame lines omitted]
Clearly the line:
RuntimeError: CUDA out of memory. Tried to allocate 132.00 MiB (GPU 0; 3.81 GiB total capacity; 2.36 GiB already allocated; 132.69 MiB free; 310.59 MiB cached) (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:267)
is the critical error.
If I run an nvidia-smi before a run:
$ nvidia-smi
Tue Dec 24 14:32:49 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44 Driver Version: 440.44 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 1650 Off | 00000000:01:00.0 On | N/A |
| N/A 47C P8 5W / N/A | 296MiB / 3903MiB | 3% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 1190 G /usr/lib/xorg/Xorg 142MiB |
| 0 1830 G /usr/bin/gnome-shell 72MiB |
| 0 3711 G ...uest-channel-token=14371934934688572948 78MiB |
+-----------------------------------------------------------------------------+
It seems pretty clear there is plenty of GPU memory available (this GPU is 4GB).
Moreover, I'm confident my CUDA/cuDNN install and GPU hardware are good, because I train and run inference with the TensorFlow Object Detection API on this computer frequently, and as long as I use the allow_growth option I never get GPU-related errors.
From Googling on this error it seems to be relatively common. The most common solutions are:
1) Try a smaller batch size (not really applicable in this case since the training and testing batch sizes are 2 and 1 respectively, and I tried with 1 and 1 and still got the same error)
2) Update to the latest version of PyTorch (but I'm already at the latest version).
Some other suggestions involve reworking the training script. I'm very familiar with TensorFlow but I'm new to PyTorch so I'm not sure how to go about that. Also, most of the rework suggestions I can find for this error do not pertain to object detection and therefore I'm not able to relate them to this training script specifically.
Has anybody else gotten this script to run locally with an NVIDIA GPU? Do you suspect an OS/CUDA/PyTorch configuration issue, or is there some way the script can be reworked to prevent this error? Any assistance would be greatly appreciated.
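For reference, a minimal sketch of the memory-reducing tweaks commonly suggested for this kind of OOM, written as drop-in changes to main() above (dataset and utils are the ones already defined there; the min_size/max_size keyword arguments are an assumption about what torchvision's detection models forward to their internal resize transform):
# hedged sketch, not the tutorial's code as written: shrink what the GPU has to hold
model = torchvision.models.detection.maskrcnn_resnet50_fpn(
    pretrained=True,
    min_size=600,   # default 800; smaller resized images use less GPU memory
    max_size=1000)  # default 1333

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=True, num_workers=4,  # batch of 1 instead of 2
    collate_fn=utils.collate_fn)

torch.cuda.empty_cache()  # release cached blocks between runs/epochs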

[UPDATE] Very strange: after changing both the training and testing batch sizes to 1, it no longer crashes with a GPU error, even though I'm certain I tried this before.
Perhaps it had something to do with changing the batch size to 1 for both training and testing and then rebooting or otherwise refreshing something? I'm not really sure. Very odd.
Now the evaluate function call is crashing with the error:
object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
But it seems this is completely unrelated so I'll make a separate post for that.

Related

SageMaker fails when using Multi-GPU with keras.utils.multi_gpu_model

Running AWS SageMaker with a custom model, the TrainingJob fails with an Algorithm Error when using Keras plus a TensorFlow backend in a multi-GPU configuration:
from keras.utils import multi_gpu_model
parallel_model = multi_gpu_model(model, gpus=K)
parallel_model.compile(loss='categorical_crossentropy',
                       optimizer='rmsprop')
parallel_model.fit(x, y, epochs=20, batch_size=256)
This simple parallel model loading fails. There is no further error or exception from CloudWatch logging. This configuration works properly on a local machine with 2x NVIDIA GTX 1080 and the same Keras TensorFlow backend.
According to the SageMaker documentation and tutorials, the multi_gpu_model utility works when the Keras backend is MXNet, but I did not find any mention of the same multi-GPU configuration when the backend is TensorFlow.
[UPDATE]
I have updated the code with the suggested answer below, and I'm adding some logging from just before the TrainingJob hangs.
This logging repeats twice:
2018-11-27 10:02:49.878414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1, 2, 3
2018-11-27 10:02:49.878462: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-11-27 10:02:49.878471: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 1 2 3
2018-11-27 10:02:49.878477: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N Y Y Y
2018-11-27 10:02:49.878481: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1: Y N Y Y
2018-11-27 10:02:49.878486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 2: Y Y N Y
2018-11-27 10:02:49.878492: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 3: Y Y Y N
2018-11-27 10:02:49.879340: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:0 with 14874 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1b.0, compute capability: 7.0)
2018-11-27 10:02:49.879486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:1 with 14874 MB memory) -> physical GPU (device: 1, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1c.0, compute capability: 7.0)
2018-11-27 10:02:49.879694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:2 with 14874 MB memory) -> physical GPU (device: 2, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1d.0, compute capability: 7.0)
2018-11-27 10:02:49.879872: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:3 with 14874 MB memory) -> physical GPU (device: 3, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0)
Before that there is some logging info about each GPU, repeated 4 times:
2018-11-27 10:02:46.447639: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 3 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0000:00:1e.0
totalMemory: 15.78GiB freeMemory: 15.37GiB
According to the logging, all 4 GPUs are visible and loaded by the TensorFlow Keras backend. After that no application logging follows; the TrainingJob status is InProgress for a while, and then it becomes Failed with the same Algorithm Error.
Looking at CloudWatch logging I can see some metrics at work. Specifically, GPU Memory Utilization and CPU Utilization are OK, while GPU Utilization is 0%.
[UPDATE]
Due to a known Keras bug with saving a multi-GPU model, I'm using this override of the multi_gpu_model utility from keras.utils:
from keras.layers import Lambda, concatenate
from keras import Model
import tensorflow as tf

def multi_gpu_model(model, gpus):
    #source: https://github.com/keras-team/keras/issues/8123#issuecomment-354857044
    if isinstance(gpus, (list, tuple)):
        num_gpus = len(gpus)
        target_gpu_ids = gpus
    else:
        num_gpus = gpus
        target_gpu_ids = range(num_gpus)

    def get_slice(data, i, parts):
        shape = tf.shape(data)
        batch_size = shape[:1]
        input_shape = shape[1:]
        step = batch_size // parts
        if i == num_gpus - 1:
            size = batch_size - step * i
        else:
            size = step
        size = tf.concat([size, input_shape], axis=0)
        stride = tf.concat([step, input_shape * 0], axis=0)
        start = stride * i
        return tf.slice(data, start, size)

    all_outputs = []
    for i in range(len(model.outputs)):
        all_outputs.append([])

    # Place a copy of the model on each GPU,
    # each getting a slice of the inputs.
    for i, gpu_id in enumerate(target_gpu_ids):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('replica_%d' % gpu_id):
                inputs = []
                # Retrieve a slice of the input.
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_i = Lambda(get_slice,
                                     output_shape=input_shape,
                                     arguments={'i': i,
                                                'parts': num_gpus})(x)
                    inputs.append(slice_i)
                # Apply model on slice
                # (creating a model replica on the target device).
                outputs = model(inputs)
                if not isinstance(outputs, list):
                    outputs = [outputs]
                # Save the outputs for merging back together later.
                for o in range(len(outputs)):
                    all_outputs[o].append(outputs[o])

    # Merge outputs on CPU.
    with tf.device('/cpu:0'):
        merged = []
        for name, outputs in zip(model.output_names, all_outputs):
            merged.append(concatenate(outputs,
                                      axis=0, name=name))
        return Model(model.inputs, merged)
This works fine on a local 2x NVIDIA GTX 1080 / Intel Xeon / Ubuntu 16.04 machine. It fails in a SageMaker Training Job.
I have posted this issue on the AWS SageMaker forum in:
TrainingJob custom algorithm with Keras backend and multi GPU
SageMaker Fails when using Multi-GPU with keras.utils.multi_gpu_model
[UPDATE]
I have slightly modified the tf.Session code, adding some initializers:
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
and now at least I can see from the instance metrics that one GPU (I assume device gpu:0) is used. Multi-GPU still does not work.
This might not be the best answer for your problem, but this is what I am using for a multi-GPU model with a TensorFlow backend. First I initialize using:
def setup_multi_gpus():
    """
    Setup multi GPU usage

    Example usage:
    model = Sequential()
    ...
    multi_model = multi_gpu_model(model, gpus=num_gpu)
    multi_model.fit()

    About memory usage:
    https://stackoverflow.com/questions/34199233/how-to-prevent-tensorflow-from-allocating-the-totality-of-a-gpu-memory
    """
    import tensorflow as tf
    from keras.utils.training_utils import multi_gpu_model
    from tensorflow.python.client import device_lib

    # IMPORTANT: Tells tf to not occupy a specific amount of memory
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)  # set this TensorFlow session as the default session for Keras.

    # getting the number of GPUs
    def get_available_gpus():
        local_device_protos = device_lib.list_local_devices()
        return [x.name for x in local_device_protos if x.device_type == 'GPU']

    num_gpu = len(get_available_gpus())
    print('Amount of GPUs available: %s' % num_gpu)
    return num_gpu
Then I call:
# Setup multi GPU usage
num_gpu = setup_multi_gpus()
and create a model.
...
After which you're able to make it a multi-GPU model:
multi_model = multi_gpu_model(model, gpus=num_gpu)
multi_model.compile...
multi_model.fit...
The only thing here that is different from what you are doing is the way TensorFlow is initializing the GPUs. I can't imagine it being the problem, but it might be worth trying out.
Good luck!
Edit: I noticed sequence-to-sequence models not being able to work with multiple GPUs. Is that the type of model you are trying to train?
I apologize for the slow response.
It seems there are a lot of threads that are running in parallel, and I want to link them together, so that other individuals who have the same issue can see the progress and discussion going on.
https://forums.aws.amazon.com/thread.jspa?messageID=881541
https://forums.aws.amazon.com/thread.jspa?messageID=881540
https://github.com/aws/sagemaker-python-sdk/issues/512
There are a few questions regarding this.
What version of TensorFlow and Keras?
I am not too sure what is causing this problem. Does your container have all of the needed dependencies, such as CUDA? https://www.tensorflow.org/install/gpu
Were you able to train using a single GPU with Keras?
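(As an aside, a minimal single-GPU sanity check along these lines could help isolate the issue; the toy model and shapes below are hypothetical and not from this thread:)
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

# Tiny hypothetical model: if this trains on one GPU inside the container,
# the CUDA/cuDNN setup is fine and the problem is specific to multi_gpu_model.
model = Sequential([Dense(64, activation='relu', input_shape=(32,)),
                    Dense(10, activation='softmax')])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

x = np.random.random((1024, 32))
y = np.random.random((1024, 10))
model.fit(x, y, epochs=1, batch_size=256)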

google colaboratory `ResourceExhaustedError` with GPU

I'm trying to fine-tune a VGG16 model using Colaboratory, but I ran into this error when training with the GPU:
OOM when allocating tensor of shape [7,7,512,4096]
INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.ResourceExhaustedError'>, OOM when allocating tensor of shape [7,7,512,4096] and type float
[[Node: vgg_16/fc6/weights/Momentum/Initializer/zeros = Const[_class=["loc:#vgg_16/fc6/weights"], dtype=DT_FLOAT, value=Tensor<type: float shape: [7,7,512,4096] values: [[[0 0 0]]]...>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
Caused by op 'vgg_16/fc6/weights/Momentum/Initializer/zeros', defined at:
I also have this output for my VM session:
--- colab vm info ---
python v=3.6.3
tensorflow v=1.4.1
tf device=/device:GPU:0
model name : Intel(R) Xeon(R) CPU @ 2.20GHz
model name : Intel(R) Xeon(R) CPU @ 2.20GHz
MemTotal: 13341960 kB
MemFree: 1541740 kB
MemAvailable: 10035212 kB
My tfrecord is just 118 256x256 JPGs with file size <2MB
Is there a workaround? It works when I use the CPU, just not the GPU.
Seeing a small amount of free GPU memory almost always indicates that you've created a TensorFlow session without the allow_growth = True option. See:
https://www.tensorflow.org/guide/using_gpu#allowing_gpu_memory_growth
If you don't set this option, by default, TensorFlow will reserve nearly all GPU memory when a session is created.
Good news: As of this week, Colab now sets this option by default, so you should see much lower growth as you use multiple notebooks on Colab. And, you can also inspect GPU memory usage per notebook by selecting 'Manage sessions' from the Runtime menu.
Once selected, you'll see a dialog that lists all notebooks and the GPU memory each is consuming. To free memory, you can terminate runtimes from this dialog as well.
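For older runtimes where this isn't set for you, a minimal sketch of creating the session with allow_growth yourself (TF 1.x API):
import tensorflow as tf

# Ask TensorFlow to allocate GPU memory on demand instead of grabbing it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)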
I met the same issue, and I found my problem was caused by the code below:
from tensorflow.python.framework.test_util import is_gpu_available as tf
if tf()==True:
    device='/gpu:0'
else:
    device='/cpu:0'
I used the code below to check the GPU memory usage and found that usage was 0% before running the code above, and it became 95% after.
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn't guaranteed
gpu = GPUs[0]

def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " I Proc size: " + humanize.naturalsize( process.memory_info().rss))
    print('GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB'.format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

printm()
Before:
Gen RAM Free: 12.7 GB I Proc size: 139.1 MB
GPU RAM Free: 11438MB | Used: 1MB | Util 0% | Total 11439MB
After:
Gen RAM Free: 12.0 GB I Proc size: 1.0 GB
GPU RAM Free: 564MB | Used: 10875MB | Util 95% | Total 11439MB
Somehow, is_gpu_available() managed to consume most of the GPU memory without releasing it afterwards, so instead I used the code below to detect the GPU status for me, and the problem was solved:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
try:
    import GPUtil as GPU
    GPUs = GPU.getGPUs()
    device='/gpu:0'
except:
    device='/cpu:0'
I failed to repro the originally-reported error, but if that is caused by running out of GPU memory (as opposed to main memory) this might help:
# See https://www.tensorflow.org/tutorials/using_gpu#allowing_gpu_memory_growth
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
and then pass session_config=config to e.g. slim.learning.train() (or whatever session ctor you end up using).
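A minimal usage sketch of that (assuming the TF 1.x / tf.contrib.slim APIs; the train_op and log directory are placeholders for your own training setup):
import tensorflow as tf
slim = tf.contrib.slim

# Recreate the config from above and hand it to the session...
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# ...or, when using slim's training loop, pass it as session_config instead:
# slim.learning.train(train_op, logdir='/tmp/train_logs', session_config=config)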
In my case the solution provided by Ami didn't work, even though it's excellent, probably because the Colaboratory VM couldn't furnish more resources.
I had the OOM error in the detection phase (not model training). I solved it with a workaround, disabling the GPU for detection:
config = tf.ConfigProto(device_count = {'GPU': 0})
sess = tf.Session(config=config)

Tensorboard - Profiling compute time - everything is a "unused substructure" in runtime statistics

TensorBoard provides runtime statistics that allow profiling of memory consumption and compute time (see the docs).
However, in TensorFlow v1.2.1 some of my ops were shown dashed and in orange as "unused substructure" and did not provide any information at all: no device, no memory, no compute time.
With the update to TensorFlow v1.3 this got even worse. Now everything is an orange dashed "unused substructure".
I tried this on various bigger TensorFlow projects that I need to optimize, as well as on colleagues' PCs.
Am I doing something wrong, or is this a bug in TensorFlow / TensorBoard?
Here is a minimal example:
import tensorflow as tf
from tensorflow.python.client import timeline
sess = tf.InteractiveSession()
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
# create some dummy Ops for the graph
C1 = tf.constant(5)
C2 = tf.constant(3)
myOp = C1*C2 + tf.square(C2)
res = sess.run([myOp], options=run_options,run_metadata = run_metadata)
writer = tf.summary.FileWriter(logdir='tensorboard/profile_bug',graph=sess.graph)
print (res)
tl = timeline.Timeline(run_metadata.step_stats)
ctf = tl.generate_chrome_trace_format()
with open('tensorboard/timelineOfBug.json', 'w') as f:
    f.write(ctf)
writer.add_run_metadata(run_metadata,"mySess")
writer.close()
sess.close()
This is an issue with some new versions of TF (seems like 1.2 and 1.3). Updating to 1.4 fixes it.

Cntk can not detect my gpu by cntk.all_devices()

CNTK only detects 1 device (my CPU) when calling cntk.all_devices(). However, I do have a GPU in my computer. By running the tutorial shipped with CNTK, I get some info:
-------------------------------------------------------------------
-------------------------------------------------------------------
GPU info:
Device[0]: cores = 48; computeCapability = 2.1; type = "NVS 310"; memory = 512 MB
-------------------------------------------------------------------
##############################################################################
# #
# Train command (train action) #
# #
##############################################################################
Model has 9 nodes. Using CPU.
As a consequence, I cannot use my GPU by calling set_default_device(gpu(0)). How can I solve this problem?
The minimum GPU compute capability for CNTK is 3.0. (Edit: The fact that you can run the tutorial using cntk.exe indicates a bug somewhere in the v1 executable.) When you run the tutorial with cntk.exe, it prints out the GPU info, but still ends up using the CPU: Model has 9 nodes. Using CPU.
The only way to solve this problem is to change the value of the constant MininumCCMajorForGpu in BestGpu.cpp and recompile.
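For reference, a hedged sketch of how one might confirm what CNTK sees from Python (these are standard cntk.device calls; on a compute-capability-2.1 card like the NVS 310, the GPU simply won't be offered without the recompile described above):
import cntk
from cntk.device import gpu, try_set_default_device, use_default_device

print(cntk.all_devices())              # lists the devices CNTK is willing to use
print(try_set_default_device(gpu(0)))  # returns False if GPU 0 is not usable
print(use_default_device())            # shows which device CNTK actually selected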

How to set specific gpu in tensorflow?

I want to specify the GPU on which to run my process, and I set it as follows:
import tensorflow as tf
with tf.device('/gpu:0'):
    a = tf.constant(3.0)
with tf.Session() as sess:
    while True:
        print sess.run(a)
However, it still allocates memory on both of my GPUs.
| 0 7479 C python 5437MiB
| 1 7479 C python 5437MiB
There are 3 ways to achieve this:
Using the CUDA_VISIBLE_DEVICES environment variable.
Setting the environment variable CUDA_VISIBLE_DEVICES="1" makes only device 1 visible, and setting CUDA_VISIBLE_DEVICES="0,1" makes devices 0 and 1 visible. You can do this in Python by having the line os.environ["CUDA_VISIBLE_DEVICES"]="0,1" after importing the os package.
Using with tf.device('/gpu:2') and creating the graph. Then it will use GPU device 2 to run.
Using config = tf.ConfigProto(device_count = {'GPU': 1}) and then sess = tf.Session(config=config). This will use GPU device 1.
TF would allocate all available memory on each visible GPU if not told otherwise. Here are 5 ways to stick to just one (or a few) GPUs.
Bash solution. Set CUDA_VISIBLE_DEVICES=0,1 in your terminal/console before starting python or jupyter notebook:
CUDA_VISIBLE_DEVICES=0,1 python script.py
Python solution. Run the next 2 lines of code before constructing a session:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
Automated solution. The method below will automatically detect GPU devices that are not used by other scripts and set CUDA_VISIBLE_DEVICES for you. You have to call mask_unused_gpus before constructing a session. It filters out GPUs by current memory usage. This way you can run multiple instances of your script at once without changing your code or setting console parameters.
The function:
import subprocess as sp
import os

def mask_unused_gpus(leave_unmasked=1):
    ACCEPTABLE_AVAILABLE_MEMORY = 1024
    COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
    try:
        _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
        memory_free_info = _output_to_list(sp.check_output(COMMAND.split()))[1:]
        memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
        available_gpus = [i for i, x in enumerate(memory_free_values) if x > ACCEPTABLE_AVAILABLE_MEMORY]
        if len(available_gpus) < leave_unmasked: raise ValueError('Found only %d usable GPUs in the system' % len(available_gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, available_gpus[:leave_unmasked]))
    except Exception as e:
        print('"nvidia-smi" is probably not installed. GPUs are not masked', e)

mask_unused_gpus(2)
Limitations: if you start multiple scripts at once it might cause a collision, because memory is not allocated immediately when you construct a session. In case it is a problem for you, you can use a randomized version as in original source code: mask_busy_gpus()
TensorFlow 2.0 suggests yet another method:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
    except RuntimeError as e:
        # Visible devices must be set at program startup
        print(e)
TensorFlow/Keras also allows you to specify the GPU to be used with the session config. I can recommend it only if setting an environment variable is not an option (i.e. an MPI run), because it tends to be the least reliable of all methods, especially with Keras.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = "0,1"
with tf.Session(config=config) as sess:
    ...  # or K.set_session(tf.Session(config=config))
I believe that you need to set CUDA_VISIBLE_DEVICES=1, or whichever GPU you want to use. If you make only one GPU visible, you will refer to it as /gpu:0 in TensorFlow regardless of what you set the environment variable to.
More info on that environment variable: https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/
You can modify the GPU options settings by adding the following at the beginning of your Python script:
gpu_options = tf.GPUOptions(visible_device_list="0")
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
"0" is here the name of the GPU you want to use. You can have the list of the GPU available by typing the command nvidia-smi in the terminal prompt.
With Keras, these 2 functions allow the selection of CPU or GPU and, in the case of a GPU, the fraction of memory that will be used.
import os
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf

def set_cpu_option():
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

def set_gpu_option(which_gpu, fraction_memory):
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = fraction_memory
    config.gpu_options.visible_device_list = which_gpu
    set_session(tf.Session(config=config))
    return

set_gpu_option("0", 0.9)
# or
set_cpu_option()
def set_specific_gpu(ID):
    gpus_all_physical_list = tf.config.list_physical_devices(device_type='GPU')
    tf.config.set_visible_devices(gpus_all_physical_list[ID], 'GPU')
refer to https://www.tensorflow.org/api_docs/python/tf/config/list_physical_devices
import tensorflow as tf
gpu_number = 2 #### GPU number
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_visible_devices(gpus[gpu_number], 'GPU')
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
The most elegant and clean way I have gotten this to work on my multi-GPU setup is:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
tf_device='/gpu:0'
This assigns the task to gpu device 1.
Similarly, doing something along the lines of:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
tf_device='/gpu:0'
The os.environ command can be seen as a way of exposing only the GPU device on which you intend to run the code. The second command just picks the first of the available devices that you specified.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"
This was the only thing that worked for me cleanly from within processes to assign a specific GPU to each process in a Pool.
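A minimal sketch of that pattern (the worker function and GPU-to-worker mapping here are hypothetical; the key point is that CUDA_VISIBLE_DEVICES must be set in each process before TensorFlow initializes the GPU):
import os
from multiprocessing import Pool

def run_on_gpu(gpu_id):
    # Must happen before TensorFlow touches the GPU in this process
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    import tensorflow as tf  # imported after masking, so it sees only one device
    return tf.test.is_gpu_available()

if __name__ == "__main__":
    with Pool(processes=2) as pool:
        print(pool.map(run_on_gpu, [0, 1]))  # one GPU per worker process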
TensorFlow 2.9 and above changed the APIs, so here is an updated version:
gpus = tf.config.list_physical_devices('GPU')
gpu_id = 0
if gpus:
    # Restrict TensorFlow to use only one GPU, based on gpu_id
    try:
        tf.config.set_visible_devices(gpus[gpu_id], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)