Failed to get convolution algorithm Tensorflow 2.3.0 - tensorflow

I recently finished the Image Super-Resolution using Autoencoders course on Coursera, and when I try to run the same code on my laptop in Spyder or a Jupyter notebook, I keep getting this error.
I am using an Nvidia GeForce 1650 Ti along with tensorflow-gpu 2.3.0, CUDA 10.1, cuDNN 7.6.5 and Python 3.8.5. I have used the same configuration to run many deep neural network problems, and none of them gave this error.
Code:
# Image Super Resolution using Autoencoder
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from skimage.transform import rescale

gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.95)
session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

# Loading the Images
x_train_n = []
x_train_down = []
x_train_n2 = []
x_train_down2 = []

path = 'D:/GPU testing/Image Super Resolution/data/cars_train/'
images = os.listdir(path)
size = 0
for a in images:
    try:
        img = image.load_img(str(path + a), target_size=(64, 64, 3))
        img_1 = image.img_to_array(img)
        img_1 = img_1 / 255.
        x_train_n.append(img_1)
        # Downscale then upscale to create the blurred training input
        dwn2 = rescale(rescale(img_1, 0.5, multichannel=True),
                       2.0, multichannel=True)
        img_2 = image.img_to_array(dwn2)
        x_train_down.append(img_2)
        size += 1
    except Exception:
        print("Error loading image")
        size += 1
    if size >= 64:
        break
x_train_n2 = np.array(x_train_n)
print(x_train_n2.shape)
x_train_down2 = np.array(x_train_down)
print(x_train_down2.shape)
# Building a Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Conv2DTranspose, UpSampling2D, add
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Building the Encoder
input_img = Input(shape=(64, 64, 3))
l1 = Conv2D(64, (3, 3), padding='same', activation='relu',
            activity_regularizer=regularizers.l1(10e-10))(input_img)
l2 = Conv2D(64, (3, 3), padding='same', activation='relu',
            activity_regularizer=regularizers.l1(10e-10))(l1)
l3 = MaxPooling2D(padding='same')(l2)
l3 = Dropout(0.3)(l3)
l4 = Conv2D(128, (3, 3), padding='same', activation='relu',
            activity_regularizer=regularizers.l1(10e-10))(l3)
l5 = Conv2D(128, (3, 3), padding='same', activation='relu',
            activity_regularizer=regularizers.l1(10e-10))(l4)
l6 = MaxPooling2D(padding='same')(l5)
l7 = Conv2D(256, (3, 3), padding='same', activation='relu',
            activity_regularizer=regularizers.l1(10e-10))(l6)

# Building the Decoder
l8 = UpSampling2D()(l7)
l9 = Conv2D(128, (3, 3), padding='same', activation='relu',
            activity_regularizer=regularizers.l1(10e-10))(l8)
l10 = Conv2D(128, (3, 3), padding='same', activation='relu',
             activity_regularizer=regularizers.l1(10e-10))(l9)
l11 = add([l5, l10])
l12 = UpSampling2D()(l11)
l13 = Conv2D(64, (3, 3), padding='same', activation='relu',
             activity_regularizer=regularizers.l1(10e-10))(l12)
l14 = Conv2D(64, (3, 3), padding='same', activation='relu',
             activity_regularizer=regularizers.l1(10e-10))(l13)
l15 = add([l14, l2])

# chan = 3, for RGB
decoded = Conv2D(3, (3, 3), padding='same', activation='relu',
                 activity_regularizer=regularizers.l1(10e-10))(l15)

# Create our network
autoencoder = Model(input_img, decoded)
autoencoder_hfenn = Model(input_img, decoded)
autoencoder.compile(optimizer='adadelta', loss='mean_squared_error')
autoencoder.summary()

# Training the Model
history = autoencoder.fit(x_train_down2, x_train_n2,
                          epochs=20,
                          batch_size=16,
                          validation_steps=100,
                          shuffle=True,
                          validation_split=0.15)

# Saving the Model
autoencoder.save('ISR_model_weight.h5')

# Representing the Model as a JSON String
autoencoder_json = autoencoder.to_json()
with open('ISR_model.json', 'w') as json_file:
    json_file.write(autoencoder_json)
Error:
Traceback (most recent call last):
File "D:\GPU testing\Image Super Resolution\Image Super Resolution using Autoencoders.py", line 126, in <module>
history = autoencoder.fit(x_train_down2, x_train_n2,
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1098, in fit
tmp_logs = train_function(iterator)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\def_function.py", line 780, in __call__
result = self._call(*args, **kwds)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\def_function.py", line 840, in _call
return self._stateless_fn(*args, **kwds)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 2829, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 1843, in _filtered_call
return self._call_flat(
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 1923, in _call_flat
return self._build_call_outputs(self._inference_function.call(
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 545, in call
outputs = execute.execute(
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\execute.py", line 59, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node functional_1/conv2d/Relu (defined at D:\GPU testing\Image Super Resolution\Image Super Resolution using Autoencoders.py:126) ]] [Op:__inference_train_function_2246]
Function call stack:
train_function
2020-09-18 20:44:19.489732: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:21.291233: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-09-18 20:44:21.306618: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x22a29eaa6b0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-09-18 20:44:21.308804: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2020-09-18 20:44:21.310433: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library nvcuda.dll
2020-09-18 20:44:22.424648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2020-09-18 20:44:22.425736: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:22.468696: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-09-18 20:44:23.161235: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-18 20:44:23.161847: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2020-09-18 20:44:23.162188: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2020-09-18 20:44:23.162708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3891 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-09-18 20:44:23.167626: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x22a52959fb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-09-18 20:44:23.168513: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): GeForce GTX 1650 Ti, Compute Capability 7.5
2020-09-18 20:44:23.642458: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2020-09-18 20:44:23.643553: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:23.647378: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-09-18 20:44:23.648372: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2020-09-18 20:44:23.649458: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:23.653267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-09-18 20:44:23.653735: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-18 20:44:23.654291: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2020-09-18 20:44:23.654631: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2020-09-18 20:44:23.655077: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3891 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-09-18 20:44:23.658359: I tensorflow/stream_executor/cuda/cuda_driver.cc:775] failed to allocate 3.80G (4080218880 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2020-09-18 20:44:23.659070: I tensorflow/stream_executor/cuda/cuda_driver.cc:775] failed to allocate 3.42G (3672196864 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2020-09-18 20:44:25.560185: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudnn64_7.dll
2020-09-18 20:44:26.855418: E tensorflow/stream_executor/cuda/cuda_dnn.cc:328] Could not create cudnn handle: CUDNN_STATUS_ALLOC_FAILED
2020-09-18 20:44:26.856558: E tensorflow/stream_executor/cuda/cuda_dnn.cc:328] Could not create cudnn handle: CUDNN_STATUS_ALLOC_FAILED
2020-09-18 20:44:26.857303: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at conv_ops_fused_impl.h:642 : Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
I have tried enabling GPU memory growth:
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
and also limiting GPU memory usage:
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction = 0.95)
session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
but they didn't resolve the issue.
I recently came across the article What is an Autoencoder? Enhance blurred images using autoencoders by Analytics Vidhya, tried the code provided there, and hit the same error.
Can someone help me resolve this issue?

The conv2d op raised an error message:
Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
Looking above, we see
Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3891 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
failed to allocate 3.80G (4080218880 bytes) from device:
CUDA_ERROR_OUT_OF_MEMORY: out of memory
failed to allocate 3.42G (3672196864 bytes) from device:
CUDA_ERROR_OUT_OF_MEMORY: out of memory
So this graph needs more memory than is available on your GeForce GTX 1650 Ti (3891 MB). Try a smaller input image size and/or a smaller batch size.
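For example (a sketch reusing the arrays from the question; batch_size=4 is illustrative, not a verified threshold for a 4 GB card):
# A smaller batch shrinks activation memory roughly in proportion.
# validation_steps is dropped: it only applies to tf.data validation inputs.
history = autoencoder.fit(x_train_down2, x_train_n2,
                          epochs=20,
                          batch_size=4,   # was 16
                          shuffle=True,
                          validation_split=0.15)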

The problem was with how GPU memory growth is set for TensorFlow 2.3.0.
After setting it properly, the error went away:
import tensorflow as tf
from tensorflow.compat.v1.keras.backend import set_session
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.compat.v1.Session(config=config)
set_session(sess)
Source: https://stackoverflow.com/a/59007505/14301371
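For what it's worth, on TF 2.x the same behaviour can also be enabled without a compat.v1 session at all (a minimal sketch, assuming it runs before any op touches the GPU):
import tensorflow as tf

# Allocate GPU memory on demand instead of grabbing almost all of it up front;
# this must run before the GPU is initialized by any operation.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)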

Related

I am getting the same error "Could not load library cudnn_cnn_infer64_8.dll. Error code 126" after trying different versions of Cuda and Cudnn [duplicate]

Could not load library cudnn_cnn_infer64_8.dll. Error code 126
Please make sure cudnn_cnn_infer64_8.dll is in your library path!
I keep getting this error when I try to use TensorFlow with the GPU. I've installed CUDA, cuDNN, and all the drivers multiple times according to the instructions, but nothing seems to work.
If I use a notebook, TensorFlow uses the CPU; with the VS Code notebook extension I can use the GPU, but it stops the session at the 1st epoch. When I tried to run it as a normal Python file, the above error occurred.
Complete terminal output:
Found 14630 validated image filenames belonging to 3 classes.
Found 1500 validated image filenames belonging to 3 classes.
2021-11-08 11:03:58.000354: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-08 11:03:58.603592: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2775 MB memory: -> device: 0, name: NVIDIA GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1
Epoch 1/10
2021-11-08 11:04:07.306011: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8300
Could not load library cudnn_cnn_infer64_8.dll. Error code 126
Please make sure cudnn_cnn_infer64_8.dll is in your library path!
E:\MyWorkSpace\animal_detect>
The code snippet:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16
import pandas as pd
import numpy as np
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_gen = ImageDataGenerator(rescale = 1./255.,rotation_range = 40, width_shift_range = 0.2, height_shift_range = 0.2, shear_range = 0.2, zoom_range = 0.2, horizontal_flip = True)
test_gen = ImageDataGenerator( rescale = 1.0/255. )
train_set = train_gen.flow_from_dataframe(train_df,x_col='loc',y_col='label',batch_size=20,target_size=(224,224))
test_set = train_gen.flow_from_dataframe(test_df,x_col='loc',y_col='label',batch_size=20,target_size=(224,224))
base_model = VGG16(input_shape=(224, 224, 3),
                   include_top=False,
                   weights='imagenet')
for layer in base_model.layers:
    layer.trainable = False
x = layers.Flatten()(base_model.output)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(3, activation='sigmoid')(x)
model = tf.keras.models.Model(base_model.input, x)
model.compile(optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001), loss = 'categorical_crossentropy',metrics = ['acc'])
vgghist = model.fit(train_set, validation_data = test_set, steps_per_epoch = 100, epochs = 10)
The same code has been used in a Jupyter notebook, in the VS Code notebook extension, and as a normal Python file.
Device specifications:
processor: Intel i5
gpu: Nvidia Geforce 1050ti
Cuda version: 11.5
cuDNN version: 8.3
For those still having this issue, please make sure you have also completed this step:
Download, unzip, and add zlibwapi.dll to your system path.
I wasted half an hour on this so you don't have to. Good luck!
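A quick way to check whether the loader can actually resolve the DLLs (a Windows-only sketch; ctypes raises OSError, typically with error code 126, when a DLL or one of its dependencies cannot be found):
import ctypes

# Each call fails with OSError if the DLL, or something it depends on
# (such as zlibwapi.dll), is not on the loader's search path.
for dll in ('zlibwapi.dll', 'cudnn_cnn_infer64_8.dll'):
    ctypes.WinDLL(dll)
    print(dll, 'loaded OK')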
The same "errors" as me. Even though I have re-compiled the tensorflow-gpu 2.6.0 with "Cuda version: 11.5 cuDNN version: 8.3".
The "errors" disappeared when I changed cudnn version to 8.2 but kept cuda version as 11.5. (Re-compiled is als needed)
So I think this error must on "cuDNN".
Please see androidu's answer. It worked perfectly.

Why is my multi-class segmentation using tensorflow-gpu and keras utilising only 2% of the gpu during training?

I am trying to run a multi-class image segmentation problem using tensorflow-gpu as the backend to Keras, on both a single GPU and multiple GPUs. What I'm finding is that training runs extremely slowly: when I look at utilisation, the GPU is barely being used, around 2%. I have roughly 10,000 images and masks of shape (224x224x3) each, and I convert the masks to a categorical-friendly one-hot-encoded structure, so that I have four classes and masks of shape (224x224x4). I am using a standard U-Net encoder-decoder architecture. Using the Sequence class, I have written my own custom generator that grabs both images and masks and does the preprocessing. Could training be slow because my custom generator is a bottleneck in the process? Am I doing too much preprocessing in the generator itself (i.e. resizing images, etc.)? I'm not sure how else to explain why it is taking so long to train. Below I have included three scripts: 1. the U-Net model, 2. the custom generator, and 3. the segmentation script that compiles the model, calls the generators, and trains. Any help as to why this is happening would be massively appreciated.
I also believe that I'm correctly using tensorflow-gpu and the available GPU, because I get the following message:
GPU Prolog Script v0.30
This is a GPU node.
Enough GPUs available.
Allocating card 1
2020-03-05 10:40:05.996313: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-03-05 10:40:06.078021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.582
pciBusID: 0000:03:00.0
2020-03-05 10:40:06.127190: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-03-05 10:40:06.221801: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-03-05 10:40:06.296413: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-03-05 10:40:06.379031: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-03-05 10:40:06.429316: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-03-05 10:40:06.485672: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-03-05 10:40:06.791850: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-03-05 10:40:06.796626: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-03-05 10:40:06.797199: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-03-05 10:40:06.813236: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2400010000 Hz
2020-03-05 10:40:06.815750: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x535a0a0 executing computations on platform Host. Devices:
2020-03-05 10:40:06.815778: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): Host, Default Version
2020-03-05 10:40:07.000335: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x53bd360 executing computations on platform CUDA. Devices:
2020-03-05 10:40:07.000385: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): GeForce GTX 1080 Ti, Compute Capability 6.1
2020-03-05 10:40:07.002638: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.582
pciBusID: 0000:03:00.0
2020-03-05 10:40:07.002714: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-03-05 10:40:07.002747: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-03-05 10:40:07.002774: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-03-05 10:40:07.002802: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-03-05 10:40:07.002829: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-03-05 10:40:07.002856: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-03-05 10:40:07.002884: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-03-05 10:40:07.010122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-03-05 10:40:07.023584: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-03-05 10:40:07.026875: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-03-05 10:40:07.026902: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2020-03-05 10:40:07.026919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2020-03-05 10:40:07.034045: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10481 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:03:00.0, compute capability: 6.1)
2020-03-05 10:54:36.697783: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-03-05 10:54:39.743744: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
import tensorflow as tf
from tensorflow import keras
import numpy as np

class Unet():
    def __init__(self, imgDims, nOutput=1, finalActivation='sigmoid', activation='relu', padding='same'):
        self.imgDims = imgDims
        self.activation = activation
        self.finalActivation = finalActivation
        self.padding = padding
        self.nOutput = nOutput

    def convBlocks(self, x, filters, kernelSize=(3,3), padding='same', strides=1):
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation(self.activation)(x)
        x = keras.layers.Conv2D(filters, kernelSize, padding=padding, strides=strides)(x)
        return x

    def identity(self, x, xInput, f, padding='same', strides=1):
        skip = keras.layers.Conv2D(f, kernel_size=(1, 1), padding=padding, strides=strides)(xInput)
        skip = keras.layers.BatchNormalization()(skip)
        output = keras.layers.Add()([skip, x])
        return output

    def residualBlock(self, xIn, f, stride):
        res = self.convBlocks(xIn, f, strides=stride)
        res = self.convBlocks(res, f, strides=1)
        output = self.identity(res, xIn, f, strides=stride)
        return output

    def upSampling(self, x, xInput):
        x = keras.layers.UpSampling2D((2,2))(x)
        x = keras.layers.Concatenate()([x, xInput])
        return x

    def encoder(self, x, filters, kernelSize=(3,3), padding='same', strides=1):
        e1 = keras.layers.Conv2D(filters[0], kernelSize, padding=padding, strides=strides)(x)
        e1 = self.convBlocks(e1, filters[0])
        shortcut = keras.layers.Conv2D(filters[0], kernel_size=(1, 1), padding=padding, strides=strides)(x)
        shortcut = keras.layers.BatchNormalization()(shortcut)
        e1Output = keras.layers.Add()([e1, shortcut])
        e2 = self.residualBlock(e1Output, filters[1], stride=2)
        e3 = self.residualBlock(e2, filters[2], stride=2)
        e4 = self.residualBlock(e3, filters[3], stride=2)
        e5 = self.residualBlock(e4, filters[4], stride=2)
        return e1Output, e2, e3, e4, e5

    def bridge(self, x, filters):
        b1 = self.convBlocks(x, filters, strides=1)
        b2 = self.convBlocks(b1, filters, strides=1)
        return b2

    def decoder(self, b2, e1, e2, e3, e4, filters, kernelSize=(3,3), padding='same', strides=1):
        x = self.upSampling(b2, e4)
        d1 = self.convBlocks(x, filters[4])
        d1 = self.convBlocks(d1, filters[4])
        d1 = self.identity(d1, x, filters[4])
        x = self.upSampling(d1, e3)
        d2 = self.convBlocks(x, filters[3])
        d2 = self.convBlocks(d2, filters[3])
        d2 = self.identity(d2, x, filters[3])
        x = self.upSampling(d2, e2)
        d3 = self.convBlocks(x, filters[2])
        d3 = self.convBlocks(d3, filters[2])
        d3 = self.identity(d3, x, filters[2])
        x = self.upSampling(d3, e1)
        d4 = self.convBlocks(x, filters[1])
        d4 = self.convBlocks(d4, filters[1])
        d4 = self.identity(d4, x, filters[1])
        return d4

    def ResUnet(self, filters=[16, 32, 64, 128, 256]):
        inputs = keras.layers.Input((self.imgDims, self.imgDims, 3))
        e1, e2, e3, e4, e5 = self.encoder(inputs, filters)
        b2 = self.bridge(e5, filters[4])
        d4 = self.decoder(b2, e1, e2, e3, e4, filters)
        x = keras.layers.Conv2D(self.nOutput, (1, 1), padding='same', activation=self.finalActivation)(d4)
        model = keras.models.Model(inputs, x)
        return model
2.
import cv2
import os
import numpy as np
from tensorflow import keras
from skimage import img_as_bool
from skimage.transform import resize

class DataGenerator(keras.utils.Sequence):
    def __init__(self, imgIds, maskIds, imagePath, maskPath, weights=[1,1,1,1],
                 batchSize=16, imageSize=(224, 224, 3), nClasses=4, shuffle=False):
        self.imgIds = imgIds
        self.maskIds = maskIds
        self.imagePath = imagePath
        self.maskPath = maskPath
        self.weights = np.array(weights)
        self.batchSize = batchSize
        self.imageSize = imageSize
        self.nClasses = nClasses
        self.shuffle = shuffle

    '''
    for each image id load the patch and corresponding mask
    '''
    def __load__(self, imgName, maskName):
        img = cv2.imread(os.path.join(self.imagePath, imgName))
        img = cv2.resize(img, (self.imageSize[0], self.imageSize[1]))
        img = img/255.0
        mask = cv2.imread(os.path.join(self.maskPath, maskName))
        mask = img_as_bool(resize(mask, (self.imageSize[0], self.imageSize[1])))
        mask = np.dstack((mask, np.zeros((224, 224))))
        mask = mask.astype('uint16')
        mask[:,:,3][mask[:,:,0]==0] = 1
        mask = self.weightMasks(mask)
        return (img, mask)

    '''
    get the files for each batch (override __getitem__ method)
    '''
    def __getitem__(self, index):
        if (index+1)*self.batchSize > len(self.imgIds):
            self.batchSize = len(self.imgIds) - index*self.batchSize
        batchImgs = self.imgIds[self.batchSize*index:self.batchSize*(index+1)]
        batchMasks = self.maskIds[self.batchSize*index:self.batchSize*(index+1)]
        batchfiles = [self.__load__(imgFile, maskFile) for imgFile, maskFile in zip(batchImgs, batchMasks)]
        images, masks = zip(*batchfiles)
        return np.array(list(images)), np.array(list(masks))

    '''
    Return number of steps per batch that are needed (override __len__ method)
    '''
    def __len__(self):
        return int(np.ceil(len(self.imgIds)/self.batchSize))
3.
import os
import csv
import cv2
import glob
import itertools
import numpy as np
import pickle
import random
import argparse
import json
import tensorflow as tf
from sklearn.utils import class_weight
from tensorflow import keras
from skimage.transform import resize
from skimage import img_as_bool
from tensorflow.keras import backend as K
from scripts.resunet_multi import Unet
from scripts.fcn8 import FCN
from scripts.utilities import saveModel, saveHistory
from scripts.evaluation import dice_coef_loss, dice_coef
from scripts.custom_datagenerator_three import DataGenerator
from scripts.custom_loss_functions import weightedCatXEntropy

def getPrediction(model, validGenerator, validIds):
    masks = []
    steps = len(validIds)//validGenerator.batchSize
    for i in range(0, steps):
        x, y = validGenerator.__getitem__(i)
        y[y==1] = 255
        masks.append(y)
        yPred = model.predict(x)
        yPred = np.argmax(yPred, axis=3)
        for img in yPred:
            x, y = validGenerator.__getitem__(i)
            y[y==1] = 255
            masks.append(y)
            yPred = model.predict(x)
            yPred = np.argmax(yPred, axis=3)

def trainSegmentationModel(args):
    basePath = args['basepath']
    imageDir = args['imagedir']
    maskDir = args['maskdir']
    if args['weightfile'] is not None:
        with open(args['weightfile'], 'r') as txtFile:
            weights = list(csv.reader(txtFile, delimiter=','))
    with open(args['paramfile']) as jsonFile:
        params = json.load(jsonFile)
    print(params['nClasses'])
    if args['model'] == 'unet':
        unet = Unet(int(params['imageDims']), nOutput=int(params['nClasses']), finalActivation=params['final'])
        model = unet.ResUnet()
    elif args['model'] == 'fcn8':
        fcn = FCN(int(params['imageDims']), nClasses=int(params['nClasses']), finalActivation=params['final'])
        model = fcn.getFCN8()
    epoch = int(params['epoch'])
    ratio = float(params['ratio'])
    imagePath = os.path.join(basePath, imageDir)
    maskPath = os.path.join(basePath, maskDir)
    imgIds = glob.glob(os.path.join(imagePath, '*'))
    imgIds = [os.path.basename(f) for f in imgIds][:200]
    maskIds = glob.glob(os.path.join(maskPath, '*'))
    maskIds = [os.path.basename(f) for f in maskIds][:200]
    trainNum = round(ratio*len(imgIds))
    validNum = np.floor((len(imgIds) - trainNum))
    trainIds = imgIds[:trainNum]
    validIds = imgIds[trainNum:]
    #testIds = imgIds[(trainNum+validNum):]
    trainMasks = maskIds[:trainNum]
    validMasks = maskIds[trainNum:]
    #testMasks = maskIds[(trainNum+validNum):]
    trainGenerator = DataGenerator(trainIds, trainMasks, imagePath, maskPath)
    validGenerator = DataGenerator(validIds, validMasks, imagePath, maskPath)
    #testGenerator = DataGenerator(testIds, validMasks, imagePath, maskPath)
    trainSteps = len(trainIds)//trainGenerator.batchSize
    validSteps = len(validIds)//validGenerator.batchSize
    if args['weightfile'] is None:
        labels = []
        for i in range(trainSteps):
            _, m = trainGenerator.__getitem__(i)
            mask = np.argmax(m, axis=3)
            labels.append(mask.reshape(-1))
        labels = [l.tolist() for l in labels]
        # materialize the chain so it can be iterated more than once below
        labels = list(itertools.chain(*labels))
        weights = class_weight.compute_class_weight('balanced', np.unique(labels), labels)
    #learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False
    adam = keras.optimizers.Adam()
    model.compile(optimizer=adam, loss=weightedCatXEntropy, metrics=[dice_coef])
    trainSteps = len(trainIds)//trainGenerator.batchSize
    validSteps = len(validIds)//validGenerator.batchSize
    history = model.fit_generator(trainGenerator,
                                  validation_data=validGenerator,
                                  steps_per_epoch=trainSteps,
                                  validation_steps=validSteps,
                                  verbose=1,
                                  epochs=epoch)
    saveModel(model, args['name'])
    saveHistory(history, args['name']+'_hist')
    #getPrediction(model, validGenerator, validIds)

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('-bp', '--basepath', required=True, help='path to image and mask directories')
    ap.add_argument('-ip', '--imagedir', required=True, help='path to image directory')
    ap.add_argument('-mp', '--maskdir', required=True, help='path to image directory')
    ap.add_argument('-m', '--model', required=True, help='neural network model to use')
    ap.add_argument('-n', '--name', required=True, help='name to save the model with')
    ap.add_argument('-wf', '--weightfile', help='file containing list of class weights for unbalanced datasets')
    ap.add_argument('-pf', '--paramfile', help='file containing parameters')
    args = vars(ap.parse_args())
    trainSegmentationModel(args)
You could try profiling your training run. There's a nice tutorial here:
https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras
Note that in some cases it is not very easy to follow and understand, but it can be very useful as well.
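If you are on TF 2.2 or later, the profiler can be driven straight from a Keras callback (a sketch reusing the model and trainGenerator names from script 3; the log_dir is arbitrary):
import tensorflow as tf

# Trace batches 10-20 of the first epoch and write the profile for TensorBoard.
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='logs/profile',
                                             profile_batch='10,20')
model.fit(trainGenerator, epochs=1, callbacks=[tb_callback])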
One more tip: given that you process the images and masks with several operations, I would seriously consider preprocessing the whole training and validation sets once up front, so that in your generator you only have to read them from files and nothing more. This way you will very likely save critical time at training (and validation) in every epoch; see the sketch below.
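For instance (a sketch: cacheDir is a hypothetical location, and __load__ is the generator method from script 2):
import os
import numpy as np

# One-off pass: do the expensive cv2/skimage work once and cache the arrays,
# so the generator can np.load() them instead of re-resizing every epoch.
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
for imgName, maskName in zip(trainIds, trainMasks):
    img, mask = trainGenerator.__load__(imgName, maskName)
    np.save(os.path.join(cacheDir, imgName + '_img.npy'), img)
    np.save(os.path.join(cacheDir, maskName + '_mask.npy'), mask)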
Hope it helps!

Keras loaded model is not working

I trained a model with Keras and saved it using model.save(); according to the Keras documentation, I don't need to save anything else or compile the model after loading.
When I load it to test it on different images, it gives this error:
totalMemory: 5.93GiB freeMemory: 5.41GiB
2018-05-17 10:10:53.265572: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1)
2018-05-17 10:10:55.939415: E tensorflow/stream_executor/cuda/cuda_dnn.cc:385] could not create cudnn handle: CUDNN_STATUS_INTERNAL_ERROR
2018-05-17 10:10:55.939452: E tensorflow/stream_executor/cuda/cuda_dnn.cc:352] could not destroy cudnn handle: CUDNN_STATUS_BAD_PARAM
2018-05-17 10:10:55.939459: F tensorflow/core/kernels/conv_ops.cc:667] Check failed: stream->parent()->GetConvolveAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)
Aborted (core dumped)
Here is the code I am using:
import cv2
import numpy as np
from keras.models import load_model
from keras.utils import to_categorical

num_classes = 17
model = load_model('model.h5')
img1 = cv2.resize(cv2.cvtColor(cv2.imread("s_0.jpg"), cv2.COLOR_BGR2RGB), (24,24))
img2 = cv2.resize(cv2.cvtColor(cv2.imread("s_f.jpg"), cv2.COLOR_BGR2RGB), (24,24))
img3 = cv2.resize(cv2.cvtColor(cv2.imread("s_2.jpg"), cv2.COLOR_BGR2RGB), (24,24))
X_test = np.array([img1, img2, img3])
Y_test = to_categorical(np.array([0, 12, 2]), num_classes)
Y_predict = model.predict(X_test)
print(np.argmax(Y_predict, axis=1))
When I use the exact same code for testing just after training (while the model is still in memory, not loaded from disk), it works fine.
It looks like your CUDA is broken. Test it by disabling the GPU: export CUDA_VISIBLE_DEVICES=-1.
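Equivalently, from inside the script (a sketch; the environment variable must be set before TensorFlow/Keras is first imported):
import os

# Hide all CUDA devices so TensorFlow falls back to the CPU.
# If prediction then works, the CUDA/cuDNN installation is the culprit.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from keras.models import load_model  # import frameworks only after setting it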

Python/Tensorflow crashes with no message upon running Dataset.make_initializable_iterator()

I am running the following code on tensorflow-gpu (GTX 1080):
data_A = tf.data.Dataset.from_tensor_slices(rainy[:606]) #shape (606, 256, 256, 3), should be abt. 476 MiB
data_B = tf.data.Dataset.from_tensor_slices(sunny) #shape (606, 256, 256, 3), should be abt. 476 MiB
print("size of array in memory: "+ str(sys.getsizeof(sunny)))
dataset = tf.data.Dataset.zip((data_A, data_B))
batched_dataset = dataset.batch(4)
iterator = batched_dataset.make_initializable_iterator()
next_element = iterator.get_next()
print("before initializing iterator")
sess.run(iterator.initializer)
print("after initializing iterator")
It crashes with no message before reaching the last print line as can be seen from the output:
2018-01-23 23:04:33.252044: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\35\tensorflow\core\common_runtime\gpu\gpu_device.cc:1105] Found device 0 with properties:
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7715
pciBusID: 0000:01:00.0
totalMemory: 8.00GiB freeMemory: 6.61GiB
2018-01-23 23:04:33.252461: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\35\tensorflow\core\common_runtime\gpu\gpu_device.cc:1195] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1)
size of array in memory: 476577936
before initializing iterator
Process finished with exit code -1073740791 (0xC0000409)
Edit: I found the following on the error code: STATUS_STACK_BUFFER_OVERRUN (0xC0000409) refers to a stack buffer overrun,
from: https://blogs.technet.microsoft.com/srd/2009/01/28/stack-overflow-stack-exhaustion-not-the-same-as-stack-buffer-overflow/
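One thing worth ruling out (an assumption, not a confirmed diagnosis): Dataset.from_tensor_slices embeds the NumPy arrays as constants in the GraphDef, which the TF 1.x importing-data guide warns wastes memory and can hit the 2 GB graph limit; it recommends feeding the arrays through placeholders when initializing the iterator instead. A sketch, reusing the rainy, sunny, and sess names from the question:
import tensorflow as tf

# Define the pipeline against placeholders so the arrays are fed once at
# initialization time instead of being serialized into the graph.
data_A_ph = tf.placeholder(rainy.dtype, (606, 256, 256, 3))
data_B_ph = tf.placeholder(sunny.dtype, sunny.shape)
dataset = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(data_A_ph),
                               tf.data.Dataset.from_tensor_slices(data_B_ph)))
batched_dataset = dataset.batch(4)
iterator = batched_dataset.make_initializable_iterator()
next_element = iterator.get_next()

sess.run(iterator.initializer,
         feed_dict={data_A_ph: rainy[:606], data_B_ph: sunny})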

Tensorflow freezes on session run when reading data from tf records

Here is the code:
import tensorflow as tf
import sys
from tensorflow.python.platform import gfile
import numpy as np
from scipy.misc import imread
import glob

with open("./labels_510.txt") as f:
    lines = list(f.readlines())
    labels = [str(w).replace("\n", "") for w in lines]

NCLASS = len(labels)
NCHANNEL = 3
WIDTH = 224
HEIGHT = 224

def getImageBatch(filenames, batch_size, capacity, min_after_dequeue):
    filenameQ = tf.train.string_input_producer(filenames, num_epochs=None)
    recordReader = tf.TFRecordReader()
    key, fullExample = recordReader.read(filenameQ)
    key_val = sess.run(key)
    print(key_val)
    features = tf.parse_single_example(
        fullExample,
        features={
            'image/height': tf.FixedLenFeature([], tf.int64),
            'image/width': tf.FixedLenFeature([], tf.int64),
            'image/colorspace': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/channels': tf.FixedLenFeature([], tf.int64),
            'image/class/label': tf.FixedLenFeature([], tf.int64),
            'image/class/text': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/format': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/filename': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/encoded': tf.FixedLenFeature([], dtype=tf.string, default_value='')
        })
    label = features['image/class/label']
    image_buffer = features['image/encoded']
    with tf.name_scope('decode_jpeg', [image_buffer], None):
        image = tf.image.decode_jpeg(image_buffer, channels=NCHANNEL)
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = tf.reshape(1 - tf.image.rgb_to_grayscale(image), [WIDTH * HEIGHT * NCHANNEL])
    label = tf.stack(tf.one_hot(label - 1, NCLASS))
    imageBatch, labelBatch = tf.train.shuffle_batch(
        [image, label], batch_size=batch_size,
        capacity=capacity,
        min_after_dequeue=min_after_dequeue)
    print(imageBatch.shape)
    print(labelBatch.shape)
    return imageBatch, labelBatch

with gfile.FastGFile("./output_graph_510.pb", 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Session() as sess:
    sess.graph.as_default()
    tf.import_graph_def(graph_def)
    tf.global_variables_initializer().run()
    image_tensor, label_batch = getImageBatch(glob.glob("./images/tf_records/validation*"), 1, 10, 2)
    image_tensor = tf.reshape(image_tensor, (1, WIDTH, HEIGHT, NCHANNEL))
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    image_data = sess.run(image_tensor)
    # print(image_data.shape)
    # softmax_tensor = sess.graph.get_tensor_by_name('import/final_result:0')
    # predictions = sess.run(softmax_tensor, {'import/input:0': image_data})
    # predictions = np.squeeze(predictions)
    # print(predictions)
    coord.request_stop()
    coord.join(threads)
When I run it, it freezes with the following message:
2017-08-17 12:33:10.235086: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235099: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235101: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235104: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235106: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.322321: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:893] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-08-17 12:33:10.322510: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
name: GeForce GTX 1050
major: 6 minor: 1 memoryClockRate (GHz) 1.493
pciBusID 0000:01:00.0
Total memory: 3.95GiB
Free memory: 2.23GiB
2017-08-17 12:33:10.322519: I tensorflow/core/common_runtime/gpu/gpu_device.cc:961] DMA: 0
2017-08-17 12:33:10.322522: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: Y
2017-08-17 12:33:10.322529: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0)
Tensorflow version: 1.2.1
Ubuntu 16.04
GeForce GTX 1050
Full project can be found here: https://github.com/kindlychung/demo-load-pb-tensorflow
It freezes because you didn't initialize the local variables associated with the queue used by tf.train.shuffle_batch. Local variables are, in general, temporary variables created for operations such as enqueue and dequeue to keep track of elements. Replace the tf.global_variables_initializer().run() call with the combined initializer below:
...
sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
image_data = sess.run(image_tensor)
print(image_data.shape)
...