How to relate kernel input data structure in CUDA kernel function with parameter input in pycuda - numpy

I am writing a cuda kernel to convert rgba image to gray scale image in pycuda, here is the PyCUDA code:
import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
kernel = SourceModule("""
#include <stdio.h>
__global__ void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
int y = threadIdx.y+ blockIdx.y* blockDim.y;
int x = threadIdx.x+ blockIdx.x* blockDim.x;
if (y < numCols && x < numRows) {
int index = numRows*y +x;
uchar4 color = rgbaImage[index];
unsigned char grey = (unsigned char)(0.299f*color.x+ 0.587f*color.y +
greyImage[index] = grey;
However, the problem is how to relate uchar4* to a numpy array. I know I can modify my kernel function to accept int* or float* and make it work, but I wonder how to make the above kernel function work in pycuda as it is.
Below is host code.
def gpu_rgb2gray(image):
shape = image.shape
n_rows, n_cols, _ = np.array(shape,
image_gray = np.empty((n_rows, n_cols), dtype=
## HERE is the confusing part: how to rearrange image to match uchar4* ??
image = image.reshape(1, -1, 4)
# Get kernel function
rgba2gray = kernel.get_function("rgba_to_greyscale")
# Define block, grid and compute
blockDim = (32, 32, 1) # 1024 threads in total
dx, mx = divmod(shape[1], blockDim[0])
dy, my = divmod(shape[0], blockDim[1])
gridDim = ((dx + (mx>0)), (dy + (my>0)), 1)
# Kernel function
# HERE doesn't work because of mismatch
rgba2gray (
cuda.In(image), cuda.Out(image_gray), n_rows, n_cols,
block=blockDim, grid=gridDim)
return image_gray
Anyone have any ideas? Thanks!

The gpuarray class has native support for CUDA's built in vector types (including uchar4).
So you can create a gpuarray instance with the correct dtype for the kernel, copy the host image to that gpuarray using buffers, and then use the gpuarray as the kernel input argument. As an example (and if I understood your code correctly), something like this should probably work:
import pycuda.gpuarray as gpuarray
def gpu_rgb2gray(image):
shape = image.shape
image_rgb = gpuarray.empty(shape, dtype=gpuarray.vec.uchar4)
image_gray = gpuarray.empty(shape, dtype=np.uint8)
# Get kernel function
rgba2gray = kernel.get_function("rgba_to_greyscale")
# Define block, grid and compute
blockDim = (32, 32, 1) # 1024 threads in total
dx, mx = divmod(shape[1], blockDim[0])
dy, my = divmod(shape[0], blockDim[1])
gridDim = ((dx + (mx>0)), (dy + (my>0)), 1)
rgba2gray ( image_rgb, image_gray, np.int32(shape[0]), np.int32(shape[1]), block=blockDim, grid=gridDim)
img_gray = np.array(image_gray.get(),
return img_gray
This would take an image of 32-bit unsigned integers, copy it to an array of uchar4 on the GPU, and then upcast the resulting array of uchar back to integers on the host (after `image_gray.get()` has copied the result back from the device).


How to invoke a kernel which has a double pointer as argument?

I'm trying to invoke a custom kernel with a double pointer as argument using CuPy.
Given a dataset of size n of d-dimensional points, I allocate an array of n floats for each dimension.
Then I put all the pointers to those arrays into another array. That's why I'm dealing with a double pointer.
My custom kernel has a signature like:
__global__ void myKernel(float **dataset, int n, int d, int* output, ...)
In native cuda code (compiled through nvcc) my code works as expected.
My python attempt to invoke the kernel looks like:
import cupy as cp
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=1_000, n_features=2, cluster_std=.5, center_box=(- 10.0, 10.0), shuffle=True, random_state=None)
with open('path', 'r') as f:
code =
output = cp.empty(len(X), dtype=cp.int32)
kern = cp.RawKernel(code, 'myKernel')
d1 = cp.array(X[:, 0])
d2 = cp.array(X[:, 1])
blocks = ...
grid = ...
args = (cp.array([,]), 2, len(X), output, ...)
shared_mem = ...
kern((grid, 1, 1), (blocks, 1, 1), args=args, shared_mem=shared_mem)
As you can probably spot, my issue is in the args = ... row.
The code does not raise errors, but the data in the output vector is definitely wrong.
Am I passing the data properly? Is there a better way?
Could you try creating a minimum reproducer? I see no problem passing a double-pointer argument.
import cupy as cp
import sys
code = '''
extern "C" __global__ void myKernel(float **dataset, int n, int* output) {
for (int i = 0; i < n; ++i) {
for (int j = 0; j < 10; ++j) {
printf("%f, ", dataset[i][j]);
kern = cp.RawKernel(code, 'myKernel')
d0 = cp.arange(10, dtype=cp.float32)
d1 = cp.arange(10, dtype=cp.float32) * 2
dataset = cp.array([,])
output = cp.empty(10, dtype=cp.int32)
def run():
kern((1, 1, 1), (1, 1, 1), args=(dataset, len(dataset), output))
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
[ 0. 2. 4. 6. 8. 10. 12. 14. 16. 18.]
[0.000000, 1.000000, 2.000000, 3.000000, 4.000000, 5.000000, 6.000000, 7.000000, 8.000000, 9.000000, ]
[0.000000, 2.000000, 4.000000, 6.000000, 8.000000, 10.000000, 12.000000, 14.000000, 16.000000, 18.000000, ]

pycuda - memcpy_dtoh, not giving what appears to have been set

I have a very simple function where I'm passing in a char array and doing a simple character match. I want to return an array of 1/0 depending on which characters are matched.
Problem: although I can see the value has been set in the data structure (as I print it in the function after it's assigned) when the int array is copied back from the device the values aren't as expected.
I'm sure it's something silly.
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
mod = SourceModule("""
__global__ void test(const char *q, const int chrSize, int *d, const int intSize) {
int v = 0;
if( q[threadIdx.x * chrSize] == 'a' || q[threadIdx.x * chrSize] == 'c' ) {
v = 1;
d[threadIdx.x * intSize] = v;
printf("x=%d, y=%d, val=%c ret=%d\\n", threadIdx.x, threadIdx.y, q[threadIdx.x * chrSize], d[threadIdx.x * intSize]);
func = mod.get_function("test")
# input data
a = np.asarray(['a','b','c','d'], dtype=np.str_)
# allocate/copy to device
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
# destination array
d = np.zeros((4), dtype=np.int16)
# allocate/copy to device
d_gpu = cuda.mem_alloc(d.nbytes)
cuda.memcpy_htod(d_gpu, d)
# run the function
func(a_gpu, np.int8(a.dtype.itemsize), d_gpu, np.int8(d.dtype.itemsize), block=(4,1,1))
# copy data back and print
cuda.memcpy_dtoh(d, d_gpu)
x=0, y=0, val=a ret=1
x=1, y=0, val=b ret=0
x=2, y=0, val=c ret=1
x=3, y=0, val=d ret=0
[1 0 0 0]
Expected output:
x=0, y=0, val=a ret=1
x=1, y=0, val=b ret=0
x=2, y=0, val=c ret=1
x=3, y=0, val=d ret=0
[1 0 1 0]
You have two main problems, neither of which have anything to do with memcpy_dtoh:
You have declared d and d_gpu as dtype np.int16, but the kernel is expecting a C++ int, leading to a type mismatch. You should use the np.int32 type to define the arrays.
The indexing of d within the kernel is incorrect. If you have declared the array to the compiler as a 32 bit type, indexing the array as d[threadIdx.x] will automatically include the correct alignment for the type. Passing and using intSize to the kernel for indexing d is not required and it is incorrect to do so.
If you fix those two issues, I suspect the code will work as intended.

What is wrong with my cython implementation of erosion operation of mathematical morphology

I have produced a naive implementation of "erosion". The performance is not relevant since I am just trying to understand the algorithm. However, the output of my implementation does not match the one I get from scipy.ndimage. What is wrong with my implementation?
Here is my implementation with a small test case:
import numpy as np
from PIL import Image
# a small image to play with a cross structuring element
imgmat = np.array([
imgmat2 = np.where(imgmat == 0, 0, 255).astype(np.uint8)
imarr = Image.fromarray(imgmat2).resize((100, 200))
imarr = np.array(imgrrr)
imarr = np.where(imarr == 0, 0, 1)
se_mat3 = np.array([
se_mat31 = np.where(se_mat3 == 1, 0, 1)
The imarr is .
My implementation of erosion:
%%cython -a
import numpy as np
cimport numpy as cnp
cdef erosionC(cnp.ndarray[cnp.int_t, ndim=2] img,
cnp.ndarray[cnp.int_t, ndim=2] B, cnp.ndarray[cnp.int_t, ndim=2] X):
X: image coordinates
struct_element_mat: black and white image, black region is considered as the shape
of structuring element
This operation checks whether (B *includes* X) = $B \subset X$
as per defined in
Serra (Jean), « Introduction to mathematical morphology »,
Computer Vision, Graphics, and Image Processing,
vol. 35, nᵒ 3 (septembre 1986).
doi: 10.1016/0734-189X(86)90002-2
Consulted le 6 août 2020, p. 283‑305.
cdef cnp.ndarray[cnp.int_t, ndim=1] a, x, bx
cdef cnp.ndarray[cnp.int_t, ndim=2] Bx, B_frame, Xcp, b
cdef bint check
a = B[0] # get an anchor point from the structuring element coordinates
B_frame = B - a # express the se element coordinates in with respect to anchor point
Xcp = X.copy()
b = img.copy()
for x in X: # X contains the foreground coordinates in the image
Bx = B_frame + x # translate relative coordinates with respect to foreground coordinates considering it as the anchor point
check = True # this is erosion so if any of the se coordinates is not in foreground coordinates we consider it a miss
for bx in Bx: # Bx contains all the translated coordinates of se
if bx not in Xcp:
check = False
if check:
b[x[0], x[1]] = 1 # if there is a hit
b[x[0], x[1]] = 0 # if there is no hit
return b
def erosion(img: np.ndarray, struct_el_mat: np.ndarray, foregroundValue = 0):
B = np.argwhere(struct_el_mat == 0)
X = np.argwhere(img == foregroundValue)
nimg = erosionC(img, B, X)
return np.where(nimg == 1, 255, 0)
The calling code for both is:
from scipy import ndimage as nd
err = nd.binary_erosion(imarr, se_mat3)
imerrCustom = erosion(imarr, se_mat31, foregroundValue=1)
err produces
imerrCustom produces
In the end, I am still not sure about it, but after having read several papers more, I assume that my interpretation of X as foreground coordinates was an error. It should have probably been the entire image that is being iterated.
As I have stated I am not sure if this interpretation is correct as well. But I made a new implementation which iterates over the image, and it gives a more plausible result. I am sharing it in here, hoping that it might help someone:
%%cython -a
import numpy as np
cimport numpy as cnp
cdef dilation_c(cnp.ndarray[cnp.uint8_t, ndim=2] X,
cnp.ndarray[cnp.uint8_t, ndim=2] SE):
X: boolean image
SE: structuring element matrix
origin: coordinate of the origin of the structuring element
This operation checks whether (B *hits* X) = $B \cap X \not = \emptyset$
as per defined in
Serra (Jean), « Introduction to mathematical morphology »,
Computer Vision, Graphics, and Image Processing,
vol. 35, nᵒ 3 (septembre 1986).
doi: 10.1016/0734-189X(86)90002-2
Consulted le 6 août 2020, p. 283‑305.
The algorithm adapts DILDIRECT of
Najman (Laurent) et Talbot (Hugues),
Mathematical morphology: from theory to applications,
2013. ISBN : 9781118600788, p. 329
to the formula given in
Jähne (Bernd),
Digital image processing,
6th rev. and ext. ed, Berlin ; New York,
2005. TA1637 .J34 2005.
ISBN : 978-3-540-24035-8.
cdef cnp.ndarray[cnp.uint8_t, ndim=2] O
cdef list elst
cdef int r, c, X_rows, X_cols, SE_rows, SE_cols, se_r, se_c
cdef cnp.ndarray[cnp.int_t, ndim=1] bp
cdef list conds
cdef bint check, b, p, cond
O = np.zeros_like(X)
X_rows, X_cols = X.shape[:2]
SE_rows, SE_cols = SE.shape[:2]
# a boolean convolution
for r in range(0, X_rows-SE_rows):
for c in range(0, X_cols - SE_cols):
conds = []
for se_r in range(SE_rows):
for se_c in range(SE_cols):
b = <bint>SE[se_r, se_c]
p = <bint>X[se_r+r, se_c+c]
conds.append(b and p)
O[r,c] = <cnp.uint8_t>any(conds)
return O
def dilation_erosion(
img: np.ndarray,
struct_el_mat: np.ndarray,
foregroundValue: int = 1,
isErosion: bool = False):
img: image matrix
struct_el: NxN mesh grid of the structuring element whose center is SE's origin
structuring element is encoded as 1
foregroundValue: value to be considered as foreground in the image
B = struct_el_mat.astype(np.uint8)
if isErosion:
X = np.where(img == foregroundValue, 0, 1).astype(np.uint8)
X = np.where(img == foregroundValue, 1, 0).astype(np.uint8)
nimg = dilation_c(X, B)
foreground, background = (255, 0) if foregroundValue == 1 else (0, 1)
if isErosion:
return np.where(nimg == 1, background, foreground).astype(np.uint8)
return np.where(nimg == 1, foreground, background).astype(np.uint8)
# return nimg

Unravel Index loops forever

I am doing some work using image processing and sparse coding. Problem is, the following code works only on some images.
Here is the image that it works perfectly on:
And here is the image that it loops forever on:
Here is the code:
import cv2
import numpy as np
import networkx as nx
from preproc import Preproc
# From
def sparsify(bu_msg, suppress_radius=3):
"""Make a sparse representation of the edges by greedily selecting features from the
output of preprocessing layer and suppressing overlapping activations.
bu_msg : 3D numpy.ndarray of float
The bottom-up messages from the preprocessing layer.
Shape is (num_feats, rows, cols)
suppress_radius : int
How many pixels in each direction we assume this filter
explains when included in the sparsification.
frcs : see train_image.
frcs = []
img = bu_msg.max(0) > 0
while True:
r, c = np.unravel_index(img.argmax(), img.shape)
print(r, c)
if not img[r, c]:
frcs.append((bu_msg[:, r, c].argmax(), r, c))
img[r - suppress_radius:r + suppress_radius + 1,
c - suppress_radius:c + suppress_radius + 1] = False
return np.array(frcs)
if __name__ == '__main__':
img = cv2.imread('', 0)
img2 = cv2.imread('', 0)
prp = Preproc()
bu_msg = prp.fwd_infer(img)
frcs = sparsify(bu_msg)
and the accompanying preprocessing code:
"""A pre-processing layer of the RCN model. See Sec S8.1 for details.
import numpy as np
from scipy.ndimage import maximum_filter
from scipy.ndimage.filters import gaussian_filter
from scipy.signal import fftconvolve
class Preproc(object):
A simplified preprocessing layer implementing Gabor filters and suppression.
num_orients : int
Number of edge filter orientations (over 2pi).
filter_scale : float
A scale parameter for the filters.
cross_channel_pooling : bool
Whether to pool across neighboring orientation channels (cf. Sec S8.1.4).
filters : [numpy.ndarray]
Kernels for oriented Gabor filters.
pos_filters : [numpy.ndarray]
Kernels for oriented Gabor filters with all-positive values.
suppression_masks : numpy.ndarray
Masks for oriented non-max suppression.
def __init__(self,
self.num_orients = num_orients
self.filter_scale = filter_scale
self.cross_channel_pooling = cross_channel_pooling
self.suppression_masks = generate_suppression_masks(filter_scale=filter_scale,
def fwd_infer(self, img, brightness_diff_threshold=18.):
"""Compute bottom-up (forward) inference.
img : numpy.ndarray
The input image.
brightness_diff_threshold : float
Brightness difference threshold for oriented edges.
bu_msg : 3D numpy.ndarray of float
The bottom-up messages from the preprocessing layer.
Shape is (num_feats, rows, cols)
filtered = np.zeros((len(self.filters),) + img.shape, dtype=np.float32)
for i, kern in enumerate(self.filters):
filtered[i] = fftconvolve(img, kern, mode='same')
localized = local_nonmax_suppression(filtered, self.suppression_masks)
# Threshold and binarize
localized *= (filtered / brightness_diff_threshold).clip(0, 1)
localized[localized < 1] = 0
if self.cross_channel_pooling:
pooled_channel_weights = [(0, 1), (-1, 1), (1, 1)]
pooled_channels = [-np.ones_like(sf) for sf in localized]
for i, pc in enumerate(pooled_channels):
for channel_offset, factor in pooled_channel_weights:
ch = (i + channel_offset) % self.num_orients
pos_chan = localized[ch]
if factor != 1:
pos_chan[pos_chan > 0] *= factor
np.maximum(pc, pos_chan, pc)
bu_msg = np.array(pooled_channels)
bu_msg = localized
# Setting background to -1
bu_msg[bu_msg == 0] = -1.
return bu_msg
def filters(self):
return get_gabor_filters(
filter_scale=self.filter_scale, num_orients=self.num_orients, weights=False)
def pos_filters(self):
return get_gabor_filters(
filter_scale=self.filter_scale, num_orients=self.num_orients, weights=True)
def get_gabor_filters(size=21, filter_scale=4., num_orients=16, weights=False):
    """Get Gabor filter bank. See Preproc for parameters and returns.

    Each kernel is a +1 / -1 impulse pair placed ``filter_scale`` pixels on
    either side of the kernel centre along the filter orientation, smoothed
    with a truncated Gaussian and scaled so its absolute values sum to 1.

    Parameters
    ----------
    size : int
        Side length of each (square) kernel.
    filter_scale : float
        Distance of the impulses from the centre; also sets the Gaussian width.
    num_orients : int
        Number of orientations, evenly spaced over [0, 2*pi).
    weights : bool
        If True, return the element-wise absolute value of every kernel.

    Returns
    -------
    filts : [numpy.ndarray]
        ``num_orients`` kernels, each of shape (size, size).
    """
    def _get_sparse_gaussian():
        """Sparse Gaussian: entries below 5% of the peak are zeroed."""
        side = int(2 * np.ceil(np.sqrt(2.) * filter_scale) + 1)
        impulse = np.zeros((side, side), np.float32)
        impulse[side // 2, side // 2] = 1
        gaussian = gaussian_filter(impulse, filter_scale / np.sqrt(2.), mode='constant')
        gaussian[gaussian < 0.05 * gaussian.max()] = 0
        return gaussian

    gaussian = _get_sparse_gaussian()
    filts = []
    for angle in np.linspace(0., 2 * np.pi, num_orients, endpoint=False):
        acts = np.zeros((size, size), np.float32)
        x, y = np.cos(angle) * filter_scale, np.sin(angle) * filter_scale
        acts[int(size / 2 + y), int(size / 2 + x)] = 1.
        acts[int(size / 2 - y), int(size / 2 - x)] = -1.
        filt = fftconvolve(acts, gaussian, mode='same')
        filt /= np.abs(filt).sum()  # Normalize to ensure the maximum output is 1
        if weights:
            filt = np.abs(filt)
        # Without this append the function would always return an empty bank.
        filts.append(filt)
    return filts
def generate_suppression_masks(filter_scale=4., num_orients=16):
    """Generate the masks for oriented non-max suppression at the given filter_scale.

    See Preproc for parameters and returns.

    For every orientation in [0, pi) a line of ones is drawn through the mask
    centre along that orientation (the centre pixel itself stays 0). The
    masks for the second half of the orientations are copies of the first half.

    Parameters
    ----------
    filter_scale : float
        Scale of the filters; the mask side is 2 * ceil(filter_scale * sqrt(2)) + 1.
    num_orients : int
        Total number of orientations. NOTE(review): the half/half copy below
        presumably expects an even value -- confirm with callers.

    Returns
    -------
    filter_masks : numpy.ndarray
        float32 array of shape (num_orients, size, size) holding 0/1 masks.
    """
    size = 2 * int(np.ceil(filter_scale * np.sqrt(2))) + 1
    cx, cy = size // 2, size // 2
    filter_masks = np.zeros((num_orients, size, size), np.float32)
    # Compute for orientations [0, pi), then reuse them for [pi, 2*pi)
    for i, angle in enumerate(np.linspace(0., np.pi, num_orients // 2, endpoint=False)):
        x, y = np.cos(angle), np.sin(angle)
        for r in range(1, int(np.sqrt(2) * size / 2)):
            dx, dy = round(r * x), round(r * y)
            if abs(dx) > cx or abs(dy) > cy:
                # Offset falls outside the mask; skip it rather than index
                # out of bounds (or wrap around via a negative index).
                continue
            filter_masks[i, int(cy + dy), int(cx + dx)] = 1
            filter_masks[i, int(cy - dy), int(cx - dx)] = 1
    filter_masks[num_orients // 2:] = filter_masks[:num_orients // 2]
    return filter_masks
def local_nonmax_suppression(filtered, suppression_masks, num_orients=16):
    """Apply oriented non-max suppression to the filters, so that only a single
    orientated edge is active at a pixel. See Preproc for additional parameters.

    Note that ``filtered`` is modified in place: its negative entries are set
    to 0 before the suppression is computed.

    Parameters
    ----------
    filtered : numpy.ndarray
        Output of filtering the input image with the filter bank.
        Shape is (num feats, rows, columns).
    suppression_masks : numpy.ndarray
        One footprint per feature channel, paired with ``filtered`` by index.
    num_orients : int
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    localized : numpy.ndarray
        Result of oriented non-max suppression. Same shape and dtype as
        ``filtered``; 1 where a response survives, 0 elsewhere.
    """
    localized = np.zeros_like(filtered)
    # Taken before clamping, so it reflects the raw responses.
    cross_orient_max = filtered.max(0)
    filtered[filtered < 0] = 0
    for i, (layer, suppress_mask) in enumerate(zip(filtered, suppression_masks)):
        # A pixel survives in its channel if nothing under the footprint beats it.
        competitor_maxs = maximum_filter(layer, footprint=suppress_mask, mode='nearest')
        localized[i] = competitor_maxs <= layer
    # Drop responses that are beaten by another channel at the same pixel.
    localized[cross_orient_max > filtered] = 0
    return localized
The problem I found was that np.unravel_index returns all the positions of features for the first image, whereas it only returns (0, 0) continuously for the second. My hypothesis is that it is either a problem with the preprocessing code, or it is a bug in the np.unravel_index function itself, but I am not too sure.
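Okay, so it turns out the problem is not in argmax or np.unravel_index. When the selected pixel lies within suppress_radius of the top or left border, r - suppress_radius (or c - suppress_radius) is negative; NumPy treats a negative slice start as counted from the end of the axis, so the slice is empty, img[r, c] is never cleared, and the while loop keeps returning the same position (here (0, 0)) forever. Clamping the slice start with max(r - suppress_radius, 0) avoids this. I also rewrote the sparsification script so that it visits every pixel once, in the same row-major order in which argmax finds the first True, instead of calling argmax repeatedly. It therefore always terminates and should now work with any image.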
def sparsify(bu_msg, suppress_radius=3):
    """Make a sparse representation of the edges by greedily selecting features from the
    output of preprocessing layer and suppressing overlapping activations.

    Pixels are visited in row-major order. Each still-active pixel contributes
    one feature and then deactivates every pixel within ``suppress_radius``
    of it in both directions.

    Parameters
    ----------
    bu_msg : 3D numpy.ndarray of float
        The bottom-up messages from the preprocessing layer.
        Shape is (num_feats, rows, cols)
    suppress_radius : int
        How many pixels in each direction we assume this filter
        explains when included in the sparsification.

    Returns
    -------
    frcs : numpy.ndarray
        One (feature index, row, column) triple per selected feature;
        see train_image.
    """
    frcs = []
    img = bu_msg.max(0) > 0
    for (r, c), _ in np.ndenumerate(img):
        if img[r, c]:
            frcs.append((bu_msg[:, r, c].argmax(), r, c))
            # Clamp the window start at 0: a negative start would be read as
            # an index from the end of the axis and select an empty slice,
            # so nothing near the top/left border would be suppressed.
            img[max(r - suppress_radius, 0):r + suppress_radius + 1,
                max(c - suppress_radius, 0):c + suppress_radius + 1] = False
    return np.array(frcs)

PyOpenCL reduction Kernel on each pixel of image as array instead of each byte (RGB mode, 24 bits )

I'm trying to calculate the average Luminance of an RGB image. To do this, I find the luminance of each pixel i.e.
L(r,g,b) = X*r + Y*g + Z*b (some linear combination).
And then find the average by summing up luminance of all pixels and dividing by width*height.
To speed this up, I'm using pyopencl.reduction.ReductionKernel
The array I pass to it is a Single Dimension Numpy Array so it works just like the example given.
import Image
import numpy as np
im ='image_00000001.bmp')
data = np.asarray(im).reshape(-1) # so data is a single dimension list
# data.dtype is uint8, data.shape is (w*h*3, )
I want to incorporate the following code from the example into it . i.e. I would make changes to datatype and the type of arrays I'm passing. This is the example:
a = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
b = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
krnl = ReductionKernel(ctx, numpy.float32, neutral="0",
reduce_expr="a+b", map_expr="x[i]*y[i]",
arguments="__global float *x, __global float *y")
my_dot_prod = krnl(a, b).get()
Except, my map_expr will work on each pixel and convert each pixel to its luminance value.
And reduce expr remains the same.
The problem is, it works on each element in the array, and I need it to work on each pixel which is 3 consecutive elements at a time (RGB ).
One solution is to have three different arrays, one for R, one for G and one for B ,which would work, but is there another way ?
Edit: I changed the program to illustrate the char4 usage instead of float4:
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
deviceID = 0
platformID = 0
N = 10
testData = np.zeros(N, dtype=cl_array.vec.char4)
dev = cl.get_platforms()[platformID].get_devices()[deviceID]
ctx = cl.Context([dev])
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
Data_In = cl.Buffer(ctx, mf.READ_WRITE, testData.nbytes)
prg = cl.Program(ctx, """
__kernel void Pack_Cmplx( __global char4* Data_In, int N)
int gid = get_global_id(0);
//Data_In[gid] = 1; // This would change all components to one
Data_In[gid].x = 1; // changing single component
Data_In[gid].y = 2;
Data_In[gid].z = 3;
Data_In[gid].w = 4;
prg.Pack_Cmplx(queue, (N,1), workGroup, Data_In, np.int32(N))
cl.enqueue_copy(queue, testData, Data_In)
print testData
I hope it helps.