I have 4 Tesla K80 GPUs in my system. I would like to automatically allocate free GPUs based on an integer input in the code. I am aware of tf.config.experimental.set_visible_devices() for assigning specific GPUs, but I currently do not know how to identify which of the GPUs are in use (except by manually running nvidia-smi). I am currently changing the code below for every run.
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to the GPUs from index 2 onwards (GPUs 2 and 3 on a
    # four-GPU machine). This slice is the part that is edited by hand per run.
    try:
        tf.config.experimental.set_visible_devices(gpus[2:], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
The above code lets me set the GPUs I want to allocate (GPUs 2 and 3 in the example above) for the run. Is there any way to obtain a list of free (unused) devices, so that the allocation can be automated instead of manually identifying which devices should be set?
I am currently using TensorFlow version 1.15
import subprocess, re
import os
import utils
# Nvidia-smi GPU memory parsing.
# Tested on nvidia-smi 370.23
# TF1.15
def run_command(cmd):
    """Run a shell command and return its standard output as a string.

    Args:
        cmd: Command line as a single string. It is interpreted by the shell
            (``shell=True``), so it must come from trusted code and never
            from user input.

    Returns:
        Everything the command wrote to stdout, decoded as UTF-8. Bytes that
        are not valid UTF-8 are replaced instead of raising, so unusual
        characters in tool output cannot crash the caller. The exit status
        of the command is not checked.
    """
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True)
    # The original decoded as strict ASCII, which raises UnicodeDecodeError on
    # any non-ASCII byte; UTF-8 is a superset of ASCII.
    return completed.stdout.decode("utf-8", errors="replace")
def list_available_gpus():
    """Return the ids of all GPUs reported by ``nvidia-smi -L``.

    Returns:
        List of integer GPU ids, in the order nvidia-smi prints them.

    Raises:
        AssertionError: If a line of the output does not start with
            ``GPU <id>:``.
    """
    output = run_command("nvidia-smi -L")
    # lines of the form GPU 0: TITAN X
    gpu_regex = re.compile(r"GPU (?P<gpu_id>\d+):")
    result = []
    for line in output.strip().split("\n"):
        m = gpu_regex.match(line)
        assert m, "Couldnt parse "+line
        # Collect the parsed id; without this the function always returned [].
        result.append(int(m.group("gpu_id")))
    return result
def gpu_memory_map():
    """Return a map of GPU id to memory allocated on that GPU, in MiB.

    Every GPU returned by ``list_available_gpus()`` starts at 0; the memory
    of each process row found in the ``nvidia-smi`` output is then added to
    the GPU it runs on.

    Returns:
        Dict mapping integer GPU id to the summed per-process memory (MiB).

    Raises:
        KeyError: If a process row names a GPU id that
            ``list_available_gpus()`` did not report.
    """
    output = run_command("nvidia-smi")
    # Only parse from the "GPU Memory" column header onwards, i.e. the
    # per-process table at the bottom of the report.
    gpu_output = output[output.find("GPU Memory"):]
    # lines of the form
    # | 0 8734 C python 11705MiB |
    memory_regex = re.compile(r"[|]\s+?(?P<gpu_id>\d+)\D+?(?P<pid>\d+).+[ ](?P<gpu_memory>\d+)MiB")
    result = {gpu_id: 0 for gpu_id in list_available_gpus()}
    for row in gpu_output.split("\n"):
        m = memory_regex.search(row)
        if not m:
            # Borders, headers and blank lines are not process rows.
            continue
        gpu_id = int(m.group("gpu_id"))
        gpu_memory = int(m.group("gpu_memory"))
        result[gpu_id] += gpu_memory
    return result
def pick_gpu_lowest_memory():
    """Return the id of the GPU with the least allocated memory.

    Ties are broken in favour of the lowest GPU id, because the candidates
    are ordered as ``(memory, gpu_id)`` tuples.
    """
    usage_by_gpu = gpu_memory_map()
    ranked = sorted((used, gpu) for gpu, used in usage_by_gpu.items())
    _, least_loaded_gpu = ranked[0]
    return least_loaded_gpu
def pick_free_gpus(num_gpus=1):
    """Return the ids of ``num_gpus`` GPUs that have no memory allocated.

    Args:
        num_gpus: How many free GPUs are required.

    Returns:
        Comma-separated GPU ids (lowest ids first), e.g. ``"2,3"``, suitable
        for assigning to the ``CUDA_VISIBLE_DEVICES`` environment variable.

    Raises:
        RuntimeError: If fewer than ``num_gpus`` GPUs are currently free.
            This also covers asking for more GPUs than the machine has,
            which previously failed with an IndexError.
    """
    # A GPU counts as free only when no process holds memory on it.
    free_gpus = sorted(
        gpu_id for (gpu_id, memory) in gpu_memory_map().items() if memory == 0)
    if len(free_gpus) < num_gpus:
        raise RuntimeError(
            f'Currently fewer than {num_gpus} GPUs are free right now, '
            f'choose {len(free_gpus)} or fewer GPUs')
    return ','.join(map(str, free_gpus[:num_gpus]))
num_gpus = 2
# Export the selection before TensorFlow is imported, so that it is already in
# effect when TensorFlow enumerates the GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = pick_free_gpus(num_gpus)
import tensorflow as tf
tf.config.optimizer.set_jit(True) # Enable XLA.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Make every GPU that TensorFlow lists visible to it.
    try:
        tf.config.experimental.set_visible_devices(gpus, 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
When I run this code https://github.com/erezposner/Pose2Seg
And I made all steps in this tutorial https://towardsdatascience.com/detection-free-human-instance-segmentation-using-pose2seg-and-pytorch-72f48dc4d23e
but I have this error in cuda:
RuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 4.00 GiB total capacity; 2.57 GiB already allocated; 74.77 MiB free; 2.85 GiB reserved in total by PyTorch) (malloc at ..\c10\cuda\CUDACachingAllocator.cpp:289) (no backtrace available)
How can I solve this?
(base) C:\Users\ASUS\Pose2Seg>python train.py
06-23 07:30:01 ===========> loading model <===========
total params in model is 334, in pretrained model is 336, init 334
06-23 07:30:03 ===========> loading data <===========
loading annotations into memory...
Done (t=4.56s)
creating index...
index created!
06-23 07:30:08 ===========> set optimizer <===========
06-23 07:30:08 ===========> training <===========
C:\Users\ASUS\Anaconda3\Anaconda\lib\site-packages\torch\nn\functional.py:2796: UserWarning: nn.functional.upsample is deprecated. Use nn.functional.interpolate instead.
warnings.warn("nn.functional.upsample is deprecated. Use nn.functional.interpolate instead.")
C:\Users\ASUS\Anaconda3\Anaconda\lib\site-packages\torch\nn\functional.py:2973: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
"See the documentation of nn.Upsample for details.".format(mode))
C:\Users\ASUS\Anaconda3\Anaconda\lib\site-packages\torch\nn\functional.py:3289: UserWarning: Default grid_sample and affine_grid behavior has changed to align_corners=False since 1.3.0. Please specify align_corners=True if the old behavior is desired. See the documentation of grid_sample for details.
warnings.warn("Default grid_sample and affine_grid behavior has changed "
C:\Users\ASUS\Anaconda3\Anaconda\lib\site-packages\torch\nn\functional.py:3226: UserWarning: Default grid_sample and affine_grid behavior has changed to align_corners=False since 1.3.0. Please specify align_corners=True if the old behavior is desired. See the documentation of grid_sample for details.
warnings.warn("Default grid_sample and affine_grid behavior has changed "
06-23 07:30:13 Epoch: [0][0/56599] Lr: [6.68e-05] Time 4.228 (4.228) Data 0.028 (0.028) loss 0.85738 (0.85738)
06-23 07:30:22 Epoch: [0][10/56599] Lr: [6.813333333333334e-05] Time 0.847 (1.280) Data 0.012 (0.051) loss 0.44195 (0.71130)
06-23 07:30:33 Epoch: [0][20/56599] Lr: [6.946666666666667e-05] Time 0.882 (1.180) Data 0.045 (0.037) loss 0.41523 (0.60743)
Traceback (most recent call last):
File "train.py", line 157, in <module>
optimizer, epoch, iteration)
File "train.py", line 74, in train
File "C:\Users\ASUS\Anaconda3\Anaconda\lib\site-packages\torch\tensor.py", line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "C:\Users\ASUS\Anaconda3\Anaconda\lib\site-packages\torch\autograd\__init__.py", line 100, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 4.00 GiB total capacity; 2.57 GiB already allocated; 74.77 MiB free; 2.85 GiB reserved in total by PyTorch) (malloc at ..\c10\cuda\CUDACachingAllocator.cpp:289)
(no backtrace available)
cudatoolkit == 10.1.243
The version of libs:
>>> import tensorflow
2020-06-23 09:45:01.840827: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
>>> tensorflow.__version__
>>> import keras
Using TensorFlow backend.
>>> keras.__version__
>>> import torch
>>> torch.__version__
>>> import torchvision
>>> torchvision.__version__
>>> import pycocotools
train.py code
import os
import sys
import time
import logging
import argparse
import numpy as np
from tqdm import tqdm
import torch
import torch.utils.data
from lib.averageMeter import AverageMeters
from lib.logger import colorlogger
from lib.timer import Timers
from lib.averageMeter import AverageMeters
from lib.torch_utils import adjust_learning_rate
import os
from modeling.build_model import Pose2Seg
from datasets.CocoDatasetInfo import CocoDatasetInfo, annToMask
from test import test
NAME = "release_base"
def setup_logdir():
    """Create the timestamped log and snapshot directories for this run.

    Both directories live under the current working directory and are named
    ``<NAME>_<timestamp>``.

    Returns:
        Tuple ``(LOGDIR, SNAPSHOTDIR)`` with the two directory paths.
    """
    timestamp = time.strftime("%Y-%m-%d_%H_%M_%S", time.localtime())
    run_name = '%s_%s' % (NAME, timestamp)
    LOGDIR = os.path.join(os.getcwd(), 'logs', run_name)
    SNAPSHOTDIR = os.path.join(os.getcwd(), 'snapshot', run_name)
    # exist_ok replaces the separate os.path.exists check and avoids the
    # check-then-create race.
    os.makedirs(LOGDIR, exist_ok=True)
    os.makedirs(SNAPSHOTDIR, exist_ok=True)
    return LOGDIR, SNAPSHOTDIR
# Module-level setup, executed when this file is run or imported.
# The order matters: the logger is given LOGDIR, so the paths must exist first.
LOGDIR, SNAPSHOTDIR = setup_logdir()
# Set logging (colorlogger is a project helper; it receives the log directory
# and the log file name)
logger = colorlogger(log_dir=LOGDIR, log_name='train_logs.txt')
# Set Global Timer (not referenced by the code visible in this file)
timers = Timers()
# Set Global AverageMeter (train() records its data/batch timings here)
averMeters = AverageMeters()
def train(model, dataloader, optimizer, epoch, iteration):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Network called as ``model(**inputs)``; its return value is
            used directly as the loss.
        dataloader: Iterable yielding keyword-argument dicts for ``model``.
        optimizer: Optimizer to step; its learning rate is rescheduled every
            iteration by ``adjust_learning_rate``.
        epoch: Epoch index, used in log lines and snapshot file names.
        iteration: Global iteration count before this pass.

    Returns:
        The global iteration count after this pass.
    """
    # switch to train mode
    model.train()
    end = time.time()
    for i, inputs in enumerate(dataloader):
        averMeters['data_time'].update(time.time() - end)
        iteration += 1
        lr = adjust_learning_rate(optimizer, iteration, BASE_LR=0.0002,
                                  STEPS=(0, 14150*15, 14150*20), GAMMA=0.1)
        # forward
        outputs = model(**inputs)
        # loss
        loss = outputs
        # backward
        # NOTE(review): assumes the model returns a scalar loss tensor - confirm.
        averMeters['loss'].update(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        averMeters['batch_time'].update(time.time() - end)
        end = time.time()
        if i % 10 == 0:
            logger.info('Epoch: [{0}][{1}/{2}]\t'
                        'Lr: [{3}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'loss {loss.val:.5f} ({loss.avg:.5f})\t'
                        .format(
                            epoch, i, len(dataloader), lr,
                            batch_time=averMeters['batch_time'], data_time=averMeters['data_time'],
                            loss=averMeters['loss']))
        if i % 10000 == 0:
            # Keep a numbered snapshot and refresh the rolling "last" one.
            torch.save(model.state_dict(), os.path.join(
                SNAPSHOTDIR, '%d_%d.pkl' % (epoch, i)))
            torch.save(model.state_dict(), os.path.join(
                SNAPSHOTDIR, 'last.pkl'))
    return iteration
class Dataset:
    """Person-keypoint training set backed by ``CocoDatasetInfo``.

    Each item is a dict with the image, its keypoints and its instance
    masks. ``collate_fn`` turns a list of items into the keyword arguments
    passed to the model.
    """

    def __init__(self,
                 image_root=r'C:\Users\ASUS\Pose2Seg\data\coco2017\train2017',
                 anno_file=r'C:\Users\ASUS\Pose2Seg\data\coco2017\annotations\person_keypoints_train2017_pose2seg.json'):
        """Load the annotation index.

        Args:
            image_root: Directory containing the training images. Defaults
                to the previously hard-coded location.
            anno_file: Path of the keypoint annotation JSON file. Defaults
                to the previously hard-coded location.
        """
        self.datainfos = CocoDatasetInfo(
            image_root, anno_file, onlyperson=True, loadimg=True)

    def __len__(self):
        """Return the number of entries in the annotation index."""
        return len(self.datainfos)

    def __getitem__(self, idx):
        """Return the image, keypoints and masks for entry ``idx``.

        Returns:
            Dict with keys ``'img'``, ``'kpts'`` and ``'masks'``.
        """
        rawdata = self.datainfos[idx]
        img = rawdata['data']
        height, width = img.shape[0:2]
        # Swap the last two axes of the keypoint array; per the original
        # comment the result is (N, 17, 3).
        gt_kpts = np.float32(rawdata['gt_keypoints']).transpose(0, 2, 1)
        # One mask per annotated segment, rendered at the image size.
        gt_masks = np.array([annToMask(segm, height, width)
                             for segm in rawdata['segms']])
        return {'img': img, 'kpts': gt_kpts, 'masks': gt_masks}

    def collate_fn(self, batch):
        """Group a list of items into per-field lists.

        Nothing is stacked: every field stays a plain list with one entry
        per item.
        """
        return {
            'batchimgs': [data['img'] for data in batch],
            'batchkpts': [data['kpts'] for data in batch],
            'batchmasks': [data['masks'] for data in batch],
        }
if __name__ == '__main__':
    # Entry point: build the model, data pipeline and optimizer, then
    # alternate training passes and evaluation until the iteration budget
    # is used up.
    logger.info('===========> loading model <===========')
    model = Pose2Seg().cuda()
    # model.init("")
    logger.info('===========> loading data <===========')
    datasetTrain = Dataset()
    # collate_fn keeps each batch as the dict of lists that train() unpacks
    # into model(**inputs).
    dataloaderTrain = torch.utils.data.DataLoader(datasetTrain, batch_size=1, shuffle=True,
                                                  num_workers=0, pin_memory=False,
                                                  collate_fn=datasetTrain.collate_fn)
    logger.info('===========> set optimizer <===========')
    ''' set your optimizer like this. Normally is Adam/SGD. '''
    #optimizer = torch.optim.SGD(model.parameters(), 0.0002, momentum=0.9, weight_decay=0.0005)
    optimizer = torch.optim.Adam(
        model.parameters(), 0.0002, weight_decay=0.0000)
    iteration = 0
    epoch = 0
    try:
        while iteration < 14150*25:
            logger.info('===========> training <===========')
            iteration = train(model, dataloaderTrain,
                              optimizer, epoch, iteration)
            epoch += 1
            logger.info('===========> testing <===========')
            test(model, dataset='cocoVal', logger=logger.info)
            test(model, dataset='OCHumanVal', logger=logger.info)
    except (KeyboardInterrupt):
        # Ctrl-C: save the current weights so the run is not lost.
        logger.info('Save ckpt on exception ...')
        torch.save(model.state_dict(), os.path.join(
            SNAPSHOTDIR, 'interrupt_%d_%d.pkl' % (epoch, iteration)))
        logger.info('Save ckpt done.')
Your GPU doesn't have enough memory. Try to reduce the batch size. If still the same, try to reduce input image size. It should work fine then.
By the way, for this type of model, 8GB of GPU memory is recommended.
def create_hparams():
return trainer_lib.create_hparams(
def create_decode_hparams():
    """Build the decode hyper-parameters from the command-line FLAGS.

    Starts from ``decoding.decode_hparams(FLAGS.decode_hparams)`` and then
    overrides the sharding, in-memory, file and reference settings with the
    corresponding flags.
    """
    hparams = decoding.decode_hparams(FLAGS.decode_hparams)
    hparams.shards = FLAGS.decode_shards
    hparams.shard_id = FLAGS.worker_id
    # In-memory decoding is on when either the flag or the parsed
    # hyper-parameters ask for it.
    hparams.decode_in_memory = (
        FLAGS.decode_in_memory or hparams.decode_in_memory)
    hparams.decode_to_file = FLAGS.decode_to_file
    hparams.decode_reference = FLAGS.decode_reference
    return hparams
hp = create_hparams()
decode_hp = create_decode_hparams()
run_conf = t2t_trainer.create_run_config(hp)
estimator = trainer_lib.create_estimator(
def input_fn():
    """Return a serving input receiver with a single int32 ``inputs`` tensor.

    The same dict is used both as the features and as the receiver tensors.
    """
    features = {
        'inputs': tf.placeholder(tf.int32, shape=(1, None, 1, 1), name="inputs"),
    }
    return tf.estimator.export.ServingInputReceiver(features, features)
predictor=tf.contrib.predictor.from_estimator(estimator, input_fn)
I got output of
InvalidArgumentError: Cannot assign a device for operation
transformer/body/parallel_0/body/encoder/layer_0/self_attention/multihead_attention/dot_product_attention/attention: Could not satisfy explicit device specification '/device:GPU:0'
because no supported kernel for GPU devices is available. Colocation
Debug Info: Colocation group had the following types and supported
devices: Root Member(assigned_device_name_index_=-1
requested_device_name_='/device:GPU:0' assigned_device_name_=''
resource_device_name_='' supported_device_types_=[CPU]
possible_devices_=[] ImageSummary: CPU
Colocation members, user-requested devices, and framework assigned
devices, if any:
(ImageSummary) /device:GPU:0
Op: ImageSummary Node attrs: max_images=1, T=DT_FLOAT,
bad_color=Tensor Registered
kernels: device='CPU'
When I print run_conf.session_config, I get allow_soft_placement: true. Many people say this setting solves the InvalidArgumentError, but it does not seem to work for me.
I have trained a gradient boosted classifier using the TF example code.
The TF estimator gradient boosted classifier suddenly stopped while training.
I think it runs for several steps at the beginning, then suddenly stops without printing any exception.
How can I find out why Python crashed?
It is hard to determine why it stopped.
lib : TF-gpu 1.13.1
cuda : 10.0
cudnn : 7.5
logs :
2019-04-15 16:40:26.175889: I
tensorflow/core/common_runtime/gpu/gpu_device.cc:1433] Found device 0
with properties: name: GeForce GTX 1060 6GB major: 6 minor: 1
memoryClockRate(GHz): 1.7845 pciBusID: 0000:07:00.0 totalMemory:
6.00GiB freeMemory: 4.97GiB 2019-04-15 16:40:26.182620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1512] Adding visible
gpu devices: 0 2019-04-15 16:40:26.832040: I
tensorflow/core/common_runtime/gpu/gpu_device.cc:984] Device
interconnect StreamExecutor with strength 1 edge matrix: 2019-04-15
16:40:26.835620: I
tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0
2019-04-15 16:40:26.836840: I
tensorflow/core/common_runtime/gpu/gpu_device.cc:1003] 0: N
2019-04-15 16:40:26.838276: I
tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created
TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with
4716 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1060
6GB, pci bus id: 0000:07:00.0, compute capability: 6.1)
checkpoint_exists (from
tensorflow.python.training.checkpoint_management) is deprecated and
will be removed in a future version. Instructions for updating: Use
standard file APIs to check for files with this prefix.
get_checkpoint_mtimes (from
tensorflow.python.training.checkpoint_management) is deprecated and
will be removed in a future version. Instructions for updating: Use
standard file utilities to get mtimes. WARNING:tensorflow:Issue
encountered when serializing resources. Type is unsupported, or the
types of the items don't match field type in CollectionDef. Note this
is a warning and probably safe to ignore. '_Resource' object has no
attribute 'name' WARNING:tensorflow:Issue encountered when serializing
resources. Type is unsupported, or the types of the items don't match
field type in CollectionDef. Note this is a warning and probably safe
to ignore. '_Resource' object has no attribute 'name'
D:\py> (just finished on training)
trn = pd.read_csv('data/santander-customer-transaction-prediction/train.csv')
tst = pd.read_csv('data/santander-customer-transaction-prediction/test.csv')
#trn = upsample(trn[trn.target==0], trn[trn.target==1])
# trn = downsample(trn[trn.target==0], trn[trn.target==1])
features = trn.columns.values[2:202]
target_name = trn.columns.values[1]
NUM_EXAMPLES = len (target)
feat1 = train.corrwith(target).sort_values().head(20).index
feat2 = train.corrwith(target).sort_values().tail(20).index
featonly = feat1.append(feat2)
feat = featonly.append(pd.Index(['target']))
train_origin, tt = train_test_split(trn, test_size=0.2)
train = train_origin[featonly]
target = train_origin[target_name]
test = tst[featonly]
target_name_tst = tst.columns.values[1]
val_train = tt[featonly]
val_target = tt[target_name]
# Training and evaluation input functions.
train_input_fn = make_input_fn(train, target)
val_input_fn = make_input_fn(val_train, val_target)
del train,target,val_train,train_origin,trn,tst
fc = tf.feature_column
feature_columns = []
for feature_name in featonly:
#logging_hook = tf.train.LoggingTensorHook({"loss" : loss, "accuracy" : accuracy}, every_n_iter=10)
params = {
'n_trees': 50,
'max_depth': 3,
'n_batches_per_layer': 1,
# You must enable center_bias = True to get DFCs. This will force the model to
# make an initial prediction before using any features (e.g. use the mean of
# the training labels for regression or log odds for classification when
# using cross entropy loss).
'center_bias': True
# config = tf.estimator.RunConfig().replace(keep_checkpoint_max = 1,
# log_step_count_steps=20, save_checkpoints_steps=20)
est = tf.estimator.BoostedTreesClassifier(feature_columns, **params,model_dir='d:\py/model/')
est.train(train_input_fn, max_steps=50)
metrics = est.evaluate(input_fn=val_input_fn,steps=1)
results = est.predict(input_fn=ttt )
result_list = list(results)
classi = list(map(lambda x : x['classes'][0].decode("utf-8"), result_list))
num = list(range(0,len(classi)))
numi = list(map(lambda x : 'test_' + str(x),num))
#df1 = pd.DataFrame(columns=('ID_code','target'))
df_result = pd.DataFrame({'ID_code' : numi, 'target' : classi})
def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Build an Estimator input function over features ``X`` and labels ``y``.

    Args:
        X: Feature table; converted with ``dict(X)``.
        y: Labels aligned with ``X``.
        n_epochs: Number of passes over the data; ``None`` repeats forever.
        shuffle: Accepted for API compatibility but currently unused.

    Returns:
        A zero-argument function returning a ``tf.data.Dataset`` that is
        repeated ``n_epochs`` times and batched by ``NUM_EXAMPLES``.
    """
    def input_fn():
        slices = tf.data.Dataset.from_tensor_slices((dict(X), y))
        # For training, cycle through the data as many times as needed.
        repeated = slices.repeat(n_epochs)
        return repeated.batch(NUM_EXAMPLES)
    return input_fn
evaluation result should be shown
I think the problem is caused by GPU memory overflow.
You can try increasing the value of 'n_batches_per_layer' according to your GPU memory size.
I use a 6 GB GPU, and the value I set is 16.
I've been trying to make AI for blackjack using RL. Now I'm trying to make two separate networks which is one way of DQN. I've searched the web and found some way and tried to use it but failed.
This error has occurred:
TypeError: Using a tf.Tensor as a Python bool is not allowed. Use if t is not None: instead of if t: to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
def one_hot(x):
b = s[x[0] * 20 + x[1] * 2 + x[2]]
return b.reshape(1, 600)
def boolstr_to_floatstr(v):
    """Map a truth value to ``1`` or ``0``.

    Anything equal to ``True`` gives 1 and anything equal to ``False``
    gives 0; every other value yields ``None``.
    """
    for flag, number in ((True, 1), (False, 0)):
        if v == flag:
            return number
    return None
X=tf.placeholder(tf.float32, shape=[1,state_number], name='input_data')
W1=tf.Variable(tf.random_uniform([state_number,128],0,0.01))#network for update
Qpred=tf.matmul(layer2,W3) # Qprediction
W4=tf.Variable(tf.random_uniform([state_number,128],0,0.01))#network for target
target=tf.matmul(layer4,W6) # target
dis=0.99 #discount factor
rList=[] #record the reward
with tf.Session() as sess:
for i in range(num_episodes): #episode 만번
s = env.reset()
rALL = 0
done = False
e=1./((i/100)+1) #exploit or explore용 상수
while not done:
s = np.asarray(s)
s[2] = boolstr_to_floatstr(s[2])
if np.random.rand(1)<e: #새로운 도전시도
a=np.argmax(Qs) #그냥 내가아는한 최댓값의 액션 선택
s1,reward,done,_=env.step(a) #
if done:
Qs[0,a]=reward+dis*np.max(Qs1) #optimal Q
if i%10==0: ##target 을 Qpredion으로 업데이트해줌
if reward==1:
rALL += reward
print('success rate: '+ str(sum(rList)/num_episodes))
print("Final Q-table values")
I need to print the success rate at the end. Before using DQN it was around 38%. If there is something wrong in my code with respect to the DQN algorithm, please tell me.
If you want to share the weights between different networks, simply create the layers with the same names inside the scope `with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):`; the weights will then be shared between the networks automatically.
I am observing that on my machine tf.matmul in tensorflow is running significantly slower than dot product in numpy. I have GTX 1080 GPU, and expecting tf.matmul to be at least as fast as when running the code using CPU (numpy).
Environment Info
Operating System
lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 16.10
Release: 16.10
Codename: yakkety
Installed version of CUDA and cuDNN:
ls -l /usr/local/cuda-8.0/lib64/libcud*
-rw-r--r-- 1 root root 556000 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudadevrt.a
lrwxrwxrwx 1 root root 16 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart.so -> libcudart.so.8.0
lrwxrwxrwx 1 root root 19 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0 -> libcudart.so.8.0.61
-rwxr-xr-x 1 root root 415432 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart.so.8.0.61
-rw-r--r-- 1 root root 775162 Feb 22 2017 /usr/local/cuda-8.0/lib64/libcudart_static.a
lrwxrwxrwx 1 voldemaro users 13 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn.so -> libcudnn.so.5
lrwxrwxrwx 1 voldemaro users 18 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5 -> libcudnn.so.5.1.10
-rwxr-xr-x 1 voldemaro users 84163560 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn.so.5.1.10
-rw-r--r-- 1 voldemaro users 70364814 Nov 6 2016 /usr/local/cuda-8.0/lib64/libcudnn_static.a
TensorFlow Setup
python -c "import tensorflow; print(tensorflow.__version__)"
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA library libcurand.so.8.0 locally
Created on Sep 28, 2017
#author: voldemaro
Running on I7/GTX 1080
no MKL
('TF version: ', 'v1.0.0-rc2-15-g47bba63-dirty')
('TF url: ', 'https://github.com/tensorflow/tensorflow/commit/47bba63')
Timing in ms for 2048 x 2048 SVD of type <type 'numpy.float32'> and matmul for 16920 x 2048 of type <type 'numpy.float32'>
numpy default SVD min: 3956.20, median: 4127.75, mean: 4264.41
TF CPU SVD min: 5926.43, median: 5951.70, mean: 5961.43
TF GPU SVD min: 5917.10, median: 6015.87, mean: 6039.63
numpy default .dot product min: 5816.97, median: 5933.43, mean: 5965.22
TF CPU matmul min: 21939.19, median: 22485.99, mean: 22374.69
TF GPU matmul min: 22026.52, median: 22109.97, mean: 22199.43
from scipy import linalg; # for svd
import numpy as np;
import os;
import sys;
import time;
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" # nospam
import tensorflow as tf;
import gc; gc.disable();
dtype = np.float32;
M = 16920;
def get_tensorflow_version_url():
    """Return the GitHub commit URL for the installed TensorFlow build.

    The commit id is taken from ``tf.__git_version__``: surrounding quotes
    and a trailing ``-dirty`` marker are removed, and the text after the
    last ``-g`` is used as the commit.

    Returns:
        URL string of the form
        ``https://github.com/tensorflow/tensorflow/commit/<commit>``.
    """
    import tensorflow as tf
    commit = tf.__git_version__
    # commit looks like this
    # 'v1.0.0-65-g4763edf-dirty'
    commit = commit.replace("'","")
    if commit.endswith('-dirty'):
        commit = commit[:-len('-dirty')]
    # rpartition keeps the whole string when there is no '-g<hash>' part,
    # where rsplit(...)[1] raised IndexError.
    commit = commit.rpartition('-g')[2]
    url = 'https://github.com/tensorflow/tensorflow/commit/'+commit
    return url
def get_mkl_version():
    """Return the version string reported by the MKL runtime, as bytes.

    Loads ``libmkl_rt.so`` and asks it to write its version string into a
    199-byte zero-filled buffer (198 is passed as the length).

    Raises:
        OSError: If ``libmkl_rt.so`` cannot be loaded.
    """
    import ctypes
    import numpy as np
    ver = np.zeros(199, dtype=np.uint8)
    mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
    mkl.MKL_Get_Version_String(ver.ctypes.data_as(ctypes.c_char_p), 198)
    # Drop the zero bytes; tobytes() replaces the deprecated tostring().
    return ver[ver != 0].tobytes()
timeline_counter = 0
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE);
def benchmark(message, func, num_runs=None):
    """Time repeated calls of ``func`` and print min/median/mean in ms.

    Args:
        message: Label printed in front of the timings.
        func: Zero-argument callable to time.
        num_runs: Number of timed calls. Defaults to the module-level
            ``NUM_RUNS`` when omitted.

    Prints ``"empty"`` instead of statistics when no run was made.
    """
    if num_runs is None:
        num_runs = NUM_RUNS
    durations = []
    for _ in range(num_runs):
        # perf_counter is the clock intended for measuring short durations.
        start_time = time.perf_counter()
        func()
        durations.append(time.perf_counter() - start_time)
    durations_ms = 1000*np.array(durations)  # seconds -> milliseconds
    if len(durations_ms) > 0:
        result = "min: %8.2f, median: %8.2f, mean: %8.2f"%(
            np.min(durations_ms), np.median(durations_ms), np.mean(durations_ms))
    else:
        result = "empty"
    print("%-20s %s"%(message, result))
if np.__config__.get_info("lapack_mkl_info"):
print("MKL version", get_mkl_version())
print("no MKL")
print("TF version: ", tf.__git_version__)
print("TF url: ", get_tensorflow_version_url())
svd_array = np.random.random_sample((N,N)).astype(dtype);
another_array = np.random.random_sample((M,N)).astype(dtype);
init_OP = tf.global_variables_initializer();
with tf.device("/gpu:0"):
init_holder_gpu = tf.placeholder(dtype, shape=(M,M));
specVarGPU = tf.random_uniform((N,N), dtype=dtype);
S_gpu = tf.random_uniform((M,N), dtype=dtype);
V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU, ), tf.transpose(S_gpu));
[D2_gpu, E1_gpu, E2_gpu] = tf.svd(specVarGPU);
with tf.device("/cpu:0"):
init_holder_cpu = tf.placeholder(dtype, shape=(M,M));
specVarCPU = tf.random_uniform((N,N), dtype=dtype);
S_cpu = tf.random_uniform((M,N), dtype=dtype);
V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), specVarCPU, ), tf.transpose(S_cpu));
[D2_cpu, E1_cpu, E2_cpu] = tf.svd(specVarCPU);
V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), E1_cpu), tf.transpose(S_cpu));
print("Timing in ms for %d x %d SVD of type %s and matmul for %d x %d of type %s"%(N, N, dtype, M, N, dtype));
# Benchmark driver: each funcN wraps one workload in a zero-argument callable
# and hands it to benchmark() together with the label to print.
def func(): linalg.svd(svd_array)
benchmark("numpy default SVD", func)
# One session is shared by all TensorFlow workloads below.
config = tf.ConfigProto(allow_soft_placement = True, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)));
sess = tf.Session(config = config);
def func2(): sess.run([D2_cpu.op, E1_cpu.op, E2_cpu.op]);
benchmark("TF CPU SVD", func2);
def func3(): sess.run([D2_gpu.op, E1_gpu.op, E2_gpu.op]);
benchmark("TF GPU SVD", func3);
def func1(): np.transpose(np.asmatrix(another_array)).getH().dot(svd_array).dot(np.transpose(another_array));
benchmark("numpy default .dot product", func1)
# func4 fetches V_cpu itself, whereas the SVD workloads above run only the .op.
def func4(): sess.run([V_cpu]);
benchmark("TF CPU matmul", func4)
def func5(): sess.run([V_gpu])
# NOTE(review): the call below passes func4, so the "TF GPU matmul" line times
# the V_cpu graph again; func5 is defined but never benchmarked.
benchmark("TF GPU matmul", func4)
Apparently tensorflow does not optimize "nested" operations, so
tf.matmul(tf.transpose(tf.conj(a)), x) takes significantly longer time than b = tf.conj(a), c = tf.transpose(b), and d = tf.matmul(c, x).
For SVD, the problem is that there is no GPU Kernel for SVD yet. See here: https://github.com/tensorflow/tensorflow/issues/11588
This means that SVD has to be computed on the CPU, even if the tensors are instantiated on the GPU. For this reason, there's an overhead for transferring data from the GPU to the CPU for computation, then back to the GPU for storing results.
For matmul on the GPU, the problem is in the last line of your benchmarking code: you are not calling func5 but func4 again, so you are benchmarking the TF CPU matmul a second time.
Aside from this, there are a few other things you may want to check in your code:
there is no need for the init_holder_cpu and init_holder_gpu vars, as you don't use them
there is no need to run the global_variables_initializer, as there are no variables
you are redefining V_cpu, using one of the outputs from SVD, so you are effectively running both SVD and the matmul in your test
A slightly cleaned up version of the code looks like:
# ... above is the same
print("TF version: ", tf.__git_version__)
print("TF url: ", get_tensorflow_version_url())
svd_array = np.random.random_sample((N,N)).astype(dtype)
another_array = np.random.random_sample((M,N)).astype(dtype)
with tf.device("/gpu:0"):
specVarGPU = tf.random_uniform((N, N), dtype=dtype)
S_gpu = tf.random_uniform((M, N), dtype=dtype)
V_gpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_gpu))), specVarGPU, ), tf.transpose(S_gpu))
D2_gpu, E1_gpu, E2_gpu = tf.svd(specVarGPU)
with tf.device("/cpu:0"):
specVarCPU = tf.random_uniform((N,N), dtype=dtype)
S_cpu = tf.random_uniform((M,N), dtype=dtype)
V_cpu = tf.matmul(tf.matmul(tf.transpose(tf.transpose(tf.conj(S_cpu))), specVarCPU, ), tf.transpose(S_cpu))
D2_cpu, E1_cpu, E2_cpu = tf.svd(specVarCPU)
config = tf.ConfigProto(allow_soft_placement = True, graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)))
def V_numpy():
    """Run the NumPy counterpart of the TensorFlow matmul chain.

    The product is computed and discarded; only the elapsed time matters.
    """
    left = np.transpose(np.transpose(np.conj(another_array)))
    right = np.transpose(another_array)
    np.matmul(np.matmul(left, svd_array), right)
with tf.Session(config = config) as sess:
print("Timing in ms for %d x %d SVD of type %s and matmul for %d x %d of type %s"%(N, N, dtype, M, N, dtype))
benchmark("numpy default SVD", lambda: linalg.svd(svd_array))
benchmark("TF CPU SVD", lambda: sess.run([D2_cpu.op, E1_cpu.op, E2_cpu.op]))
benchmark("TF GPU SVD", lambda: sess.run([D2_gpu.op, E1_gpu.op, E2_gpu.op]))
benchmark("numpy MKL matmul", V_numpy)
benchmark("TF CPU matmul", lambda: sess.run([V_cpu.op]))
benchmark("TF GPU matmul", lambda: sess.run([V_gpu.op]))
And the output (on an i7 and a GTX 1070):
MKL version b'Intel(R) Math Kernel Library Version 2017.0.3 Product Build 20170413 for Intel(R) 64 architecture applications'
TF version: v1.4.0-rc1-11-g130a514
TF url: https://github.com/tensorflow/tensorflow/commit/130a514
Timing in ms for 2048 x 2048 SVD of type <class 'numpy.float32'> and matmul for 16920 x 2048 of type <class 'numpy.float32'>
numpy default SVD min: 3318.42, median: 3320.40, mean: 3320.40
TF CPU SVD min: 4576.71, median: 4577.02, mean: 4577.02
TF GPU SVD min: 14022.59, median: 14172.69, mean: 14172.69
numpy MKL matmul min: 4500.33, median: 4628.01, mean: 4628.01
TF CPU matmul min: 15420.19, median: 15664.84, mean: 15664.84
TF GPU matmul min: 277.80, median: 282.54, mean: 282.54
You can see that the GPU version of matmul is much faster than any CPU implementation, as expected.