Failed to get convolution algorithm Tensorflow 2.3.0 - tensorflow
I recently finished the Image Super Resolution using Autoencoders course on Coursera, and when I try to run the same code on my laptop in Spyder and Jupyter Notebook, I keep getting this error.
I am using an NVIDIA GeForce GTX 1650 Ti with tensorflow-gpu 2.3.0, CUDA 10.1, cuDNN 7.6.5 and Python 3.8.5. I have used the same configuration to run many deep neural network problems and none of them gave this error.
Code:
# Image Super Resolution using Autoencoder
# Loading the Images
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from skimage.transform import rescale

x_train_n = []
x_train_down = []
x_train_n2 = []
x_train_down2 = []

gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.95)
session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))

path = 'D:/GPU testing/Image Super Resolution/data/cars_train/'
images = os.listdir(path)
size = 0
for a in images:
    try:
        img = image.load_img(str(path + a), target_size=(64, 64, 3))
        img_1 = image.img_to_array(img)
        img_1 = img_1 / 255.
        x_train_n.append(img_1)
        # Downscale by 2x, then upscale back to 64x64 to create the blurred input
        dwn2 = rescale(rescale(img_1, 0.5, multichannel=True),
                       2.0, multichannel=True)
        img_2 = image.img_to_array(dwn2)
        x_train_down.append(img_2)
        size += 1
    except Exception:
        print("Error loading image")
        size += 1
    if size >= 64:
        break

x_train_n2 = np.array(x_train_n)
print(x_train_n2.shape)
x_train_down2 = np.array(x_train_down)
print(x_train_down2.shape)
# Building a Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Conv2DTranspose, UpSampling2D, add
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
# Building the Encoder
input_img = Input(shape=(64, 64, 3))
l1 = Conv2D(64, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(input_img)
l2 = Conv2D(64, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l1)
l3 = MaxPooling2D(padding='same')(l2)
l3 = Dropout(0.3)(l3)
l4 = Conv2D(128, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l3)
l5 = Conv2D(128, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l4)
l6 = MaxPooling2D(padding='same')(l5)
l7 = Conv2D(256, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l6)
# Building the Decoder
l8 = UpSampling2D()(l7)
l9 = Conv2D(128, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l8)
l10 = Conv2D(128, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l9)
l11 = add([l5, l10])
l12 = UpSampling2D()(l11)
l13 = Conv2D(64, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l12)
l14 = Conv2D(64, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l13)
l15 = add([l14, l2])
# chan = 3, for RGB
decoded = Conv2D(3, (3, 3), padding='same', activation='relu',
activity_regularizer=regularizers.l1(10e-10))(l15)
# Create our network
autoencoder = Model(input_img, decoded)
autoencoder_hfenn = Model(input_img, decoded)
autoencoder.compile(optimizer='adadelta', loss='mean_squared_error')
autoencoder.summary()
# Training the Model
history = autoencoder.fit(x_train_down2, x_train_n2,
epochs=20,
batch_size=16,
validation_steps=100,
shuffle=True,
validation_split=0.15)
# Saving the Model
autoencoder.save('ISR_model_weight.h5')
# Representing the Model as a JSON String
autoencoder_json = autoencoder.to_json()
with open('ISR_model.json', 'w') as json_file:
    json_file.write(autoencoder_json)
Error:
Traceback (most recent call last):
File "D:\GPU testing\Image Super Resolution\Image Super Resolution using Autoencoders.py", line 126, in <module>
history = autoencoder.fit(x_train_down2, x_train_n2,
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1098, in fit
tmp_logs = train_function(iterator)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\def_function.py", line 780, in __call__
result = self._call(*args, **kwds)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\def_function.py", line 840, in _call
return self._stateless_fn(*args, **kwds)
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 2829, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 1843, in _filtered_call
return self._call_flat(
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 1923, in _call_flat
return self._build_call_outputs(self._inference_function.call(
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\function.py", line 545, in call
outputs = execute.execute(
File "D:\anaconda3\envs\tensorflow_gpu\lib\site-packages\tensorflow\python\eager\execute.py", line 59, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node functional_1/conv2d/Relu (defined at D:\GPU testing\Image Super Resolution\Image Super Resolution using Autoencoders.py:126) ]] [Op:__inference_train_function_2246]
Function call stack:
train_function
2020-09-18 20:44:19.489732: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:21.291233: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2020-09-18 20:44:21.306618: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x22a29eaa6b0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-09-18 20:44:21.308804: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2020-09-18 20:44:21.310433: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library nvcuda.dll
2020-09-18 20:44:22.424648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2020-09-18 20:44:22.425736: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:22.468696: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-09-18 20:44:23.161235: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-18 20:44:23.161847: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2020-09-18 20:44:23.162188: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2020-09-18 20:44:23.162708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3891 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-09-18 20:44:23.167626: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x22a52959fb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-09-18 20:44:23.168513: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): GeForce GTX 1650 Ti, Compute Capability 7.5
2020-09-18 20:44:23.642458: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2020-09-18 20:44:23.643553: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:23.647378: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-09-18 20:44:23.648372: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2020-09-18 20:44:23.649458: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll
2020-09-18 20:44:23.653267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0
2020-09-18 20:44:23.653735: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-09-18 20:44:23.654291: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263] 0
2020-09-18 20:44:23.654631: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0: N
2020-09-18 20:44:23.655077: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3891 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
2020-09-18 20:44:23.658359: I tensorflow/stream_executor/cuda/cuda_driver.cc:775] failed to allocate 3.80G (4080218880 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2020-09-18 20:44:23.659070: I tensorflow/stream_executor/cuda/cuda_driver.cc:775] failed to allocate 3.42G (3672196864 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2020-09-18 20:44:25.560185: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudnn64_7.dll
2020-09-18 20:44:26.855418: E tensorflow/stream_executor/cuda/cuda_dnn.cc:328] Could not create cudnn handle: CUDNN_STATUS_ALLOC_FAILED
2020-09-18 20:44:26.856558: E tensorflow/stream_executor/cuda/cuda_dnn.cc:328] Could not create cudnn handle: CUDNN_STATUS_ALLOC_FAILED
2020-09-18 20:44:26.857303: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES failed at conv_ops_fused_impl.h:642 : Unknown: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
I have tried enabling GPU memory growth:
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
and also limiting GPU memory usage:
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction = 0.95)
session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
but they didn't resolve the issue.
I recently came across the article What is Autoencoder? Enhance blurred images using autoencoders by Analytics Vidhya, tried the code provided there, and ran into the same error.
Can someone help me resolve this issue?
The conv2d op raised an error message:
Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
Looking above, we see
Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3891 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5)
failed to allocate 3.80G (4080218880 bytes) from device:
CUDA_ERROR_OUT_OF_MEMORY: out of memory
failed to allocate 3.42G (3672196864 bytes) from device:
CUDA_ERROR_OUT_OF_MEMORY: out of memory
So this graph needs more memory than is available on your GeForce GTX 1650 Ti (3891 MB). Try using a smaller input image size and/or a smaller batch size, as in the sketch below.
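For illustration only (a sketch, not code from the question): the cheapest first experiment is to shrink the batch size in the fit call and see whether the convolution then fits into memory.
# Sketch: same fit call as in the question, with batch_size reduced
# from 16 to 4 so the forward/backward pass fits into ~4 GB of GPU memory
history = autoencoder.fit(x_train_down2, x_train_n2,
                          epochs=20,
                          batch_size=4,
                          shuffle=True,
                          validation_split=0.15)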
The problem was with how GPU memory growth is set for TensorFlow 2.3.0.
After setting it properly, I could get rid of the error:
import tensorflow as tf
from tensorflow.compat.v1.keras.backend import set_session
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.compat.v1.Session(config=config)
set_session(sess)
Source: https://stackoverflow.com/a/59007505/14301371
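Note: on TensorFlow 2.x the same effect can be had without a compat.v1 session. A minimal sketch using the tf.config API (it must run before anything touches the GPU):
import tensorflow as tf

# Allocate GPU memory incrementally instead of reserving almost all of it
# up front, which is what leaves cuDNN unable to create its handle.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)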
Related
I am getting the same error "Could not load library cudnn_cnn_infer64_8.dll. Error code 126" after trying different versions of Cuda and Cudnn [duplicate]
Could not load library cudnn_cnn_infer64_8.dll. Error code 126
Please make sure cudnn_cnn_infer64_8.dll is in your library path!
I keep getting this error when I try to use TensorFlow with GPU. I've installed CUDA, cuDNN, and all the drivers multiple times according to the instructions, but nothing seems to work. If I use a notebook, then TensorFlow uses the CPU; with the VS Code notebook extension I can use the GPU, but it stops the session at the first epoch. When I tried to run it as a normal Python file, the above error occurred.
Complete terminal output:
Found 14630 validated image filenames belonging to 3 classes.
Found 1500 validated image filenames belonging to 3 classes.
2021-11-08 11:03:58.000354: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-08 11:03:58.603592: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2775 MB memory: -> device: 0, name: NVIDIA GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1
Epoch 1/10
2021-11-08 11:04:07.306011: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8300
Could not load library cudnn_cnn_infer64_8.dll. Error code 126
Please make sure cudnn_cnn_infer64_8.dll is in your library path!
E:\MyWorkSpace\animal_detect>
The code snippet:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.applications.vgg16 import VGG16
import pandas as pd
import numpy as np

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_gen = ImageDataGenerator(rescale=1./255., rotation_range=40,
                               width_shift_range=0.2, height_shift_range=0.2,
                               shear_range=0.2, zoom_range=0.2,
                               horizontal_flip=True)
test_gen = ImageDataGenerator(rescale=1.0/255.)
train_set = train_gen.flow_from_dataframe(train_df, x_col='loc', y_col='label',
                                          batch_size=20, target_size=(224, 224))
test_set = train_gen.flow_from_dataframe(test_df, x_col='loc', y_col='label',
                                         batch_size=20, target_size=(224, 224))
base_model = VGG16(input_shape=(224, 224, 3), include_top=False, weights='imagenet')
for layer in base_model.layers:
    layer.trainable = False
x = layers.Flatten()(base_model.output)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(3, activation='sigmoid')(x)
model = tf.keras.models.Model(base_model.input, x)
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001),
              loss='categorical_crossentropy', metrics=['acc'])
vgghist = model.fit(train_set, validation_data=test_set, steps_per_epoch=100, epochs=10)
The same code has been used in a Jupyter notebook, in the VS Code notebook extension, and as a normal Python file.
Device specifications:
processor: Intel i5
gpu: NVIDIA GeForce 1050 Ti
CUDA version: 11.5
cuDNN version: 8.3
For those still having this issue, please make sure you have also completed this step: download, unzip, and add zlibwapi.dll to your system path. I wasted half an hour on this so you don't have to. Good luck!
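If you would rather not edit the system PATH, a minimal per-script sketch (the folder C:\tools\zlib below is a hypothetical location for the unzipped DLL; requires Python 3.8+ on Windows, and assumes cuDNN picks the DLL up from a registered DLL directory):
import os

# Hypothetical path: wherever zlibwapi.dll was unzipped to.
# Register it before importing TensorFlow so cuDNN can find the DLL.
os.add_dll_directory(r'C:\tools\zlib')

import tensorflow as tf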
The same "errors" as me. Even though I have re-compiled the tensorflow-gpu 2.6.0 with "Cuda version: 11.5 cuDNN version: 8.3". The "errors" disappeared when I changed cudnn version to 8.2 but kept cuda version as 11.5. (Re-compiled is als needed) So I think this error must on "cuDNN". Please See androidu's answer. It worked perfectly.
Why is my multi-class segmentation using tensorflow-gpu and keras utilising only 2% of the gpu during training?
I am trying to run my multi-class image segmentation problem using tensorflow-gpu as the backend to Keras, on both a single GPU and multiple GPUs. What I'm finding is that training runs extremely slowly: when I look at the utilisation, I can see that the GPU is barely being used, around 2%. I have roughly 10,000 images and masks that are (224x224x3) each, and I convert the masks to a categorical-friendly one-hot-encoded structure such that I have four classes and masks with shape (224x224x4). I am using a standard U-Net encoder/decoder architecture. Using the Sequence class, I have written my own custom generator that grabs both images and masks and does the preprocessing. I wonder whether my training is slow because my custom generator is some sort of bottleneck in the process. Am I doing too much preprocessing in the generator itself (i.e. resizing images etc.)? I'm not sure how else to explain why it is taking so long to train. Below I have included three scripts: 1. the U-Net model, 2. the custom generator, and 3. the segmentation script that compiles the model, trains it and calls the generators. Any help as to why this is happening would be massively appreciated. I also believe that I'm correctly using tensorflow-gpu and the available GPU, because I get the following message:
GPU Prolog Script v0.30
This is a GPU node.
Enough GPUs available. Allocating card 1
2020-03-05 10:40:05.996313: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-03-05 10:40:06.078021: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.582 pciBusID: 0000:03:00.0
2020-03-05 10:40:06.127190: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-03-05 10:40:06.221801: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-03-05 10:40:06.296413: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-03-05 10:40:06.379031: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-03-05 10:40:06.429316: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-03-05 10:40:06.485672: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-03-05 10:40:06.791850: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-03-05 10:40:06.796626: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-03-05 10:40:06.797199: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2020-03-05 10:40:06.813236: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2400010000 Hz
2020-03-05 10:40:06.815750: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x535a0a0 executing computations on platform Host. Devices:
2020-03-05 10:40:06.815778: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): Host, Default Version
2020-03-05 10:40:07.000335: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x53bd360 executing computations on platform CUDA. Devices:
2020-03-05 10:40:07.000385: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): GeForce GTX 1080 Ti, Compute Capability 6.1
2020-03-05 10:40:07.002638: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.582 pciBusID: 0000:03:00.0
2020-03-05 10:40:07.002714: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-03-05 10:40:07.002747: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2020-03-05 10:40:07.002774: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2020-03-05 10:40:07.002802: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2020-03-05 10:40:07.002829: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2020-03-05 10:40:07.002856: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2020-03-05 10:40:07.002884: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-03-05 10:40:07.010122: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2020-03-05 10:40:07.023584: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2020-03-05 10:40:07.026875: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-03-05 10:40:07.026902: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2020-03-05 10:40:07.026919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2020-03-05 10:40:07.034045: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10481 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:03:00.0, compute capability: 6.1)
2020-03-05 10:54:36.697783: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-03-05 10:54:39.743744: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
1.
import tensorflow as tf
from tensorflow import keras
import numpy as np

class Unet():
    def __init__(self, imgDims, nOutput=1, finalActivation='sigmoid', activation='relu', padding='same'):
        self.imgDims = imgDims
        self.activation = activation
        self.finalActivation = finalActivation
        self.padding = padding
        self.nOutput = nOutput

    def convBlocks(self, x, filters, kernelSize=(3,3), padding='same', strides=1):
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation(self.activation)(x)
        x = keras.layers.Conv2D(filters, kernelSize, padding=padding, strides=strides)(x)
        return x

    def identity(self, x, xInput, f, padding='same', strides=1):
        skip = keras.layers.Conv2D(f, kernel_size=(1, 1), padding=padding,
                                   strides=strides)(xInput)
        skip = keras.layers.BatchNormalization()(skip)
        output = keras.layers.Add()([skip, x])
        return output

    def residualBlock(self, xIn, f, stride):
        res = self.convBlocks(xIn, f, strides=stride)
        res = self.convBlocks(res, f, strides=1)
        output = self.identity(res, xIn, f, strides=stride)
        return output

    def upSampling(self, x, xInput):
        x = keras.layers.UpSampling2D((2,2))(x)
        x = keras.layers.Concatenate()([x, xInput])
        return x

    def encoder(self, x, filters, kernelSize=(3,3), padding='same', strides=1):
        e1 = keras.layers.Conv2D(filters[0], kernelSize, padding=padding, strides=strides)(x)
        e1 = self.convBlocks(e1, filters[0])
        shortcut = keras.layers.Conv2D(filters[0], kernel_size=(1, 1), padding=padding, strides=strides)(x)
        shortcut = keras.layers.BatchNormalization()(shortcut)
        e1Output = keras.layers.Add()([e1, shortcut])
        e2 = self.residualBlock(e1Output, filters[1], stride=2)
        e3 = self.residualBlock(e2, filters[2], stride=2)
        e4 = self.residualBlock(e3, filters[3], stride=2)
        e5 = self.residualBlock(e4, filters[4], stride=2)
        return e1Output, e2, e3, e4, e5

    def bridge(self, x, filters):
        b1 = self.convBlocks(x, filters, strides=1)
        b2 = self.convBlocks(b1, filters, strides=1)
        return b2

    def decoder(self, b2, e1, e2, e3, e4, filters, kernelSize=(3,3), padding='same', strides=1):
        x = self.upSampling(b2, e4)
        d1 = self.convBlocks(x, filters[4])
        d1 = self.convBlocks(d1, filters[4])
        d1 = self.identity(d1, x, filters[4])
        x = self.upSampling(d1, e3)
        d2 = self.convBlocks(x, filters[3])
        d2 = self.convBlocks(d2, filters[3])
        d2 = self.identity(d2, x, filters[3])
        x = self.upSampling(d2, e2)
        d3 = self.convBlocks(x, filters[2])
        d3 = self.convBlocks(d3, filters[2])
        d3 = self.identity(d3, x, filters[2])
        x = self.upSampling(d3, e1)
        d4 = self.convBlocks(x, filters[1])
        d4 = self.convBlocks(d4, filters[1])
        d4 = self.identity(d4, x, filters[1])
        return d4

    def ResUnet(self, filters=[16, 32, 64, 128, 256]):
        inputs = keras.layers.Input((self.imgDims, self.imgDims, 3))
        e1, e2, e3, e4, e5 = self.encoder(inputs, filters)
        b2 = self.bridge(e5, filters[4])
        d4 = self.decoder(b2, e1, e2, e3, e4, filters)
        x = keras.layers.Conv2D(self.nOutput, (1, 1), padding='same', activation=self.finalActivation)(d4)
        model = keras.models.Model(inputs, x)
        return model
2.
import cv2
import os
import numpy as np
from tensorflow import keras
from skimage import img_as_bool
from skimage.transform import resize

class DataGenerator(keras.utils.Sequence):
    def __init__(self, imgIds, maskIds, imagePath, maskPath, weights=[1,1,1,1],
                 batchSize=16, imageSize=(224, 224, 3), nClasses=4, shuffle=False):
        self.imgIds = imgIds
        self.maskIds = maskIds
        self.imagePath = imagePath
        self.maskPath = maskPath
        self.weights = np.array(weights)
        self.batchSize = batchSize
        self.imageSize = imageSize
        self.nClasses = nClasses
        self.shuffle = shuffle

    '''
    for each image id load the patch and corresponding mask
    '''
    def __load__(self, imgName, maskName):
        img = cv2.imread(os.path.join(self.imagePath, imgName))
        img = cv2.resize(img, (self.imageSize[0], self.imageSize[1]))
        img = img/255.0
        mask = cv2.imread(os.path.join(self.maskPath, maskName))
        mask = img_as_bool(resize(mask, (self.imageSize[0], self.imageSize[1])))
        mask = np.dstack((mask, np.zeros((224, 224))))
        mask = mask.astype('uint16')
        mask[:,:,3][mask[:,:,0]==0] = 1
        mask = self.weightMasks(mask)
        return (img, mask)

    '''
    get the files for each batch (override __getitem__ method)
    '''
    def __getitem__(self, index):
        if (index+1)*self.batchSize > len(self.imgIds):
            self.batchSize = len(self.imgIds) - index*self.batchSize
        batchImgs = self.imgIds[self.batchSize*index:self.batchSize*(index+1)]
        batchMasks = self.maskIds[self.batchSize*index:self.batchSize*(index+1)]
        batchfiles = [self.__load__(imgFile, maskFile) for imgFile, maskFile in zip(batchImgs, batchMasks)]
        images, masks = zip(*batchfiles)
        return np.array(list(images)), np.array(list(masks))

    '''
    Return number of steps per batch that are needed (override __len__ method)
    '''
    def __len__(self):
        return int(np.ceil(len(self.imgIds)/self.batchSize))
3.
import os
import csv
import cv2
import glob
import numpy as np
import pickle
import random
import argparse
import json
import tensorflow as tf
from sklearn.utils import class_weight
from tensorflow import keras
from skimage.transform import resize
from skimage import img_as_bool
from tensorflow.keras import backend as K
from scripts.resunet_multi import Unet
from scripts.fcn8 import FCN
from scripts.utilities import saveModel, saveHistory
from scripts.evaluation import dice_coef_loss, dice_coef
from scripts.custom_datagenerator_three import DataGenerator
from scripts.custom_loss_functions import weightedCatXEntropy

def getPrediction(model, validGenerator, validIds):
    steps = len(validIds)//validGenerator.batchSize
    for i in range(0, steps):
        x, y = validGenerator.__getitem__(i)
        y[y==1] = 255
        masks.append(y)
        yPred = model.predict(x)
        yPred = np.argmax(yPred, axis=3)
        for img in yPred:
            x, y = validGenerator.__getitem__(i)
            y[y==1] = 255
            masks.append(y)
            yPred = model.predict(x)
            yPred = np.argmax(yPred, axis=3)

def trainSegmentationModel(args):
    basePath = args['basepath']
    imageDir = args['imagedir']
    maskDir = args['maskdir']
    if args['weightfile'] is not None:
        with open(args['weightfile'], 'r') as txtFile:
            weights = list(csv.reader(txtFile, delimiter=','))
    with open(args['paramfile']) as jsonFile:
        params = json.load(jsonFile)
    print(params['nClasses'])
    if args['model'] == 'unet':
        unet = Unet(int(params['imageDims']), nOutput=int(params['nClasses']), finalActivation=params['final'])
        model = unet.ResUnet()
    elif args['model'] == 'fcn8':
        fcn = FCN(int(params['imageDims']), nClasses=int(params['nClasses']), finalActivation=params['final'])
        model = fcn.getFCN8()
    epoch = int(params['epoch'])
    ratio = float(params['ratio'])
    imagePath = os.path.join(basePath, imageDir)
    maskPath = os.path.join(basePath, maskDir)
    imgIds = glob.glob(os.path.join(imagePath, '*'))
    imgIds = [os.path.basename(f) for f in imgIds][:200]
    maskIds = glob.glob(os.path.join(maskPath, '*'))
    maskIds = [os.path.basename(f) for f in maskIds][:200]
    trainNum = round(ratio*len(imgIds))
    validNum = np.floor((len(imgIds) - trainNum))
    trainIds = imgIds[:trainNum]
    validIds = imgIds[trainNum:]
    #testIds = imgIds[(trainNum+validNum):]
    trainMasks = maskIds[:trainNum]
    validMasks = maskIds[trainNum:]
    #testMasks = maskIds[(trainNum+validNum):]
    trainGenerator = DataGenerator(trainIds, trainMasks, imagePath, maskPath)
    validGenerator = DataGenerator(validIds, validMasks, imagePath, maskPath)
    #testGenerator = DataGenerator(testIds, validMasks, imagePath, maskPath)
    trainSteps = len(trainIds)//trainGenerator.batchSize
    validSteps = len(validIds)//validGenerator.batchSize
    if args['weightfile'] is None:
        for i in range(trainSteps):
            _, m = trainGenerator.__getitem__(i)
            mask = np.argmax(m, axis=3)
            labels.append(mask.reshape(-1))
        labels = [l.tolist() for l in labels]
        labels = itertools.chain(*labels)
        weights = class_weight.compute_class_weight('balanced', np.unique(labels), labels)
    #learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False
    adam = keras.optimizers.Adam()
    model.compile(optimizer=adam, loss=weightedCatXEntropy, metrics=[dice_coef])
    trainSteps = len(trainIds)//trainGenerator.batchSize
    validSteps = len(validIds)//validGenerator.batchSize
    history = model.fit_generator(trainGenerator, validation_data=validGenerator,
                                  steps_per_epoch=trainSteps, validation_steps=validSteps,
                                  verbose=1, epochs=epoch)
    saveModel(model, args['name'])
    saveHistory(history, args['name']+'_hist')
    #getPrediction(model, validGenerator, validIds)

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('-bp', '--basepath', required=True, help='path to image and mask directories')
    ap.add_argument('-ip', '--imagedir', required=True, help='path to image directory')
    ap.add_argument('-mp', '--maskdir', required=True, help='path to image directory')
    ap.add_argument('-m', '--model', required=True, help='neural network model to use')
    ap.add_argument('-n', '--name', required=True, help='name to save the model with')
    ap.add_argument('-wf', '--weightfile', help='file containing list of class weights for unbalanced datasets')
    ap.add_argument('-pf', '--paramfile', help='file containing parameters')
    args = vars(ap.parse_args())
    trainSegmentationModel(args)
You could try profiling your training; there's a nice tutorial here: https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras. Note that in some cases it is not very easy to follow and understand, but it can be very useful; see the sketch right after this answer. One more tip: given that you process the images and masks with several operations, I would seriously consider preprocessing the whole training and validation sets up front, so that in your generator you only have to read them from files and nothing more. This way, it is highly probable that you save critical time at training (and validation) time for every epoch. Hope it helps!
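A minimal sketch of attaching the profiler to a Keras fit call (assuming a TF 2.x setup as in the linked tutorial; the log directory and batch range are arbitrary, and trainGenerator/model follow the question's names):
import tensorflow as tf

# Profile batches 10-20 of the first epoch; inspect the trace in
# TensorBoard's Profile tab to see whether the input pipeline
# (the custom generator) is the bottleneck.
tbCallback = tf.keras.callbacks.TensorBoard(log_dir='logs/profile',
                                            profile_batch='10,20')
model.fit(trainGenerator, epochs=1, callbacks=[tbCallback])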
Keras loaded model is Not working
I trained a model with Keras and saved it using model.save(); from the Keras documentation, I don't need to save anything else or compile the model after loading. When I load it to test it on different images, it gives this error:
totalMemory: 5.93GiB freeMemory: 5.41GiB
2018-05-17 10:10:53.265572: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1)
2018-05-17 10:10:55.939415: E tensorflow/stream_executor/cuda/cuda_dnn.cc:385] could not create cudnn handle: CUDNN_STATUS_INTERNAL_ERROR
2018-05-17 10:10:55.939452: E tensorflow/stream_executor/cuda/cuda_dnn.cc:352] could not destroy cudnn handle: CUDNN_STATUS_BAD_PARAM
2018-05-17 10:10:55.939459: F tensorflow/core/kernels/conv_ops.cc:667] Check failed: stream->parent()->GetConvolveAlgorithms( conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)
Aborted (core dumped)
Here is the code I am using:
num_classes = 17
model = load_model('model.h5')
img1 = cv2.resize(cv2.cvtColor(cv2.imread("s_0.jpg"), cv2.COLOR_BGR2RGB), (24, 24))
img2 = cv2.resize(cv2.cvtColor(cv2.imread("s_f.jpg"), cv2.COLOR_BGR2RGB), (24, 24))
img3 = cv2.resize(cv2.cvtColor(cv2.imread("s_2.jpg"), cv2.COLOR_BGR2RGB), (24, 24))
X_test = np.array([img1, img2, img3])
Y_test = to_categorical(np.array([0, 12, 2]), num_classes)
Y_predict = model.predict(X_test)
print np.argmax(Y_predict, axis=1)
When I use the exact same code for testing right after training (while the model is still in memory rather than loaded), it works fine.
It looks like your CUDA installation is broken. Test this by disabling the GPU:
export CUDA_VISIBLE_DEVICES=-1
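The same check can be done from inside the script, provided the variable is set before TensorFlow is imported:
import os

# Hide all GPUs from TensorFlow; if the crash disappears on CPU,
# the CUDA/cuDNN installation is the likely culprit.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import tensorflow as tf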
Python/Tensorflow crashes with no message upon running Dataset.make_initializable_iterator()
I am running the following code on tensorflow-gpu (GTX 1080):
data_A = tf.data.Dataset.from_tensor_slices(rainy[:606])  # shape (606, 256, 256, 3), should be abt. 476 MiB
data_B = tf.data.Dataset.from_tensor_slices(sunny)        # shape (606, 256, 256, 3), should be abt. 476 MiB
print("size of array in memory: " + str(sys.getsizeof(sunny)))
dataset = tf.data.Dataset.zip((data_A, data_B))
batched_dataset = dataset.batch(4)
iterator = batched_dataset.make_initializable_iterator()
next_element = iterator.get_next()
print("before initializing iterator")
sess.run(iterator.initializer)
print("after initializing iterator")
It crashes with no message before reaching the last print line, as can be seen from the output:
2018-01-23 23:04:33.252044: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\35\tensorflow\core\common_runtime\gpu\gpu_device.cc:1105] Found device 0 with properties: name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7715 pciBusID: 0000:01:00.0 totalMemory: 8.00GiB freeMemory: 6.61GiB
2018-01-23 23:04:33.252461: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\35\tensorflow\core\common_runtime\gpu\gpu_device.cc:1195] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1)
size of array in memory: 476577936
before initializing iterator
Process finished with exit code -1073740791 (0xC0000409)
Edit: I found the following on the error code: STATUS_STACK_BUFFER_OVERRUN (0xc0000409) refers to a stack buffer overflow, from: https://blogs.technet.microsoft.com/srd/2009/01/28/stack-overflow-stack-exhaustion-not-the-same-as-stack-buffer-overflow/
Tensorflow freezes on session run when reading data from tf records
Here is the code:
import tensorflow as tf
import sys
from tensorflow.python.platform import gfile
import numpy as np
from scipy.misc import imread
import glob

with open("./labels_510.txt") as f:
    lines = list(f.readlines())
labels = [str(w).replace("\n", "") for w in lines]
NCLASS = len(labels)
NCHANNEL = 3
WIDTH = 224
HEIGHT = 224

def getImageBatch(filenames, batch_size, capacity, min_after_dequeue):
    filenameQ = tf.train.string_input_producer(filenames, num_epochs=None)
    recordReader = tf.TFRecordReader()
    key, fullExample = recordReader.read(filenameQ)
    key_val = sess.run(key)
    print(key_val)
    features = tf.parse_single_example(
        fullExample,
        features={
            'image/height': tf.FixedLenFeature([], tf.int64),
            'image/width': tf.FixedLenFeature([], tf.int64),
            'image/colorspace': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/channels': tf.FixedLenFeature([], tf.int64),
            'image/class/label': tf.FixedLenFeature([], tf.int64),
            'image/class/text': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/format': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/filename': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
            'image/encoded': tf.FixedLenFeature([], dtype=tf.string, default_value='')
        })
    label = features['image/class/label']
    image_buffer = features['image/encoded']
    with tf.name_scope('decode_jpeg', [image_buffer], None):
        image = tf.image.decode_jpeg(image_buffer, channels=NCHANNEL)
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = tf.reshape(1 - tf.image.rgb_to_grayscale(image), [WIDTH * HEIGHT * NCHANNEL])
    label = tf.stack(tf.one_hot(label - 1, NCLASS))
    imageBatch, labelBatch = tf.train.shuffle_batch(
        [image, label], batch_size=batch_size,
        capacity=capacity, min_after_dequeue=min_after_dequeue)
    print(imageBatch.shape)
    print(labelBatch.shape)
    return imageBatch, labelBatch

with gfile.FastGFile("./output_graph_510.pb", 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

with tf.Session() as sess:
    sess.graph.as_default()
    tf.import_graph_def(graph_def)
    tf.global_variables_initializer().run()
    image_tensor, label_batch = getImageBatch(glob.glob("./images/tf_records/validation*"), 1, 10, 2)
    image_tensor = tf.reshape(image_tensor, (1, WIDTH, HEIGHT, NCHANNEL))
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    image_data = sess.run(image_tensor)
    # print(image_data.shape)
    # softmax_tensor = sess.graph.get_tensor_by_name('import/final_result:0')
    # predictions = sess.run(softmax_tensor, {'import/input:0': image_data})
    # predictions = np.squeeze(predictions)
    # print(predictions)
    coord.request_stop()
    coord.join(threads)
When I run it, it freezes with the following message:
2017-08-17 12:33:10.235086: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235099: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235101: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235104: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.235106: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
2017-08-17 12:33:10.322321: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:893] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-08-17 12:33:10.322510: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: name: GeForce GTX 1050 major: 6 minor: 1 memoryClockRate (GHz) 1.493 pciBusID 0000:01:00.0 Total memory: 3.95GiB Free memory: 2.23GiB
2017-08-17 12:33:10.322519: I tensorflow/core/common_runtime/gpu/gpu_device.cc:961] DMA: 0
2017-08-17 12:33:10.322522: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: Y
2017-08-17 12:33:10.322529: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0)
Tensorflow version: 1.2.1
Ubuntu 16.04
GeForce GTX 1050
The full project can be found here: https://github.com/kindlychung/demo-load-pb-tensorflow
It freezes because you didn't initialize the local variables associated with the queue used in tf.train.shuffle_batch. Local variables are, in general, temporary variables created for operations such as enqueue and dequeue to keep track of elements:
...
sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
image_data = sess.run(image_tensor)
print(image_data.shape)
...