pycuda - memcpy_dtoh, not giving what appears to have been set - numpy

I have a very simple function where I'm passing in a char array and doing a simple character match. I want to return an array of 1/0 depending on which characters are matched.
Problem: although I can see the value has been set in the data structure (as I print it in the function after it's assigned) when the int array is copied back from the device the values aren't as expected.
I'm sure it's something silly.
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
mod = SourceModule("""
__global__ void test(const char *q, const int chrSize, int *d, const int intSize) {
int v = 0;
if( q[threadIdx.x * chrSize] == 'a' || q[threadIdx.x * chrSize] == 'c' ) {
v = 1;
}
d[threadIdx.x * intSize] = v;
printf("x=%d, y=%d, val=%c ret=%d\\n", threadIdx.x, threadIdx.y, q[threadIdx.x * chrSize], d[threadIdx.x * intSize]);
}
""")
func = mod.get_function("test")
# input data
a = np.asarray(['a','b','c','d'], dtype=np.str_)
# allocate/copy to device
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
# destination array
d = np.zeros((4), dtype=np.int16)
# allocate/copy to device
d_gpu = cuda.mem_alloc(d.nbytes)
cuda.memcpy_htod(d_gpu, d)
# run the function
func(a_gpu, np.int8(a.dtype.itemsize), d_gpu, np.int8(d.dtype.itemsize), block=(4,1,1))
# copy data back and priint
cuda.memcpy_dtoh(d, d_gpu)
print(d)
Output:
x=0, y=0, val=a ret=1
x=1, y=0, val=b ret=0
x=2, y=0, val=c ret=1
x=3, y=0, val=d ret=0
[1 0 0 0]
Expected output:
x=0, y=0, val=a ret=1
x=1, y=0, val=b ret=0
x=2, y=0, val=c ret=1
x=3, y=0, val=d ret=0
[1 0 1 0]

You have two main problems, neither of which have anything to do with memcpy_dtoh:
You have declared d and d_gpu as dtype np.int16, but the kernel is expecting C++ int, leading to a type mistmatch. You should use the np.int32 type to define the arrays.
The indexing of d within the kernel is incorrect. If you have declared the array to the compiler as a 32 bit type, indexing the array as d[threadIdx.x] will automatically include the correct alignment for the type. Passing and using intSize to the kernel for indexing d is not required and it is incorrect to do so.
If you fix those two issues, I suspect the code will work as intended.

Related

How to invoke a kernel which has a double pointer as argument?

I'm trying to invoke a custom kernel with a double pointer as argument using CuPy.
Given a dataset of size n of d-dimensional points, I allocate an array of n floats for each dimension.
Then I put all the pointers to those arrays into another array. That's why I'm dealing with a double pointer.
My custom kernel has a signature like:
__global__ void myKernel(float **dataset, int n, int d, int* output, ...)
In native cuda code (compiled through nvcc) my code works as expected.
My python attempt to invoke the kernel looks like:
import cupy as cp
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=1_000, n_features=2, cluster_std=.5, center_box=(- 10.0, 10.0), shuffle=True, random_state=None)
with open('path', 'r') as f:
code = f.read()
output = cp.empty(len(X), dtype=cp.int32)
kern = cp.RawKernel(code, 'myKernel')
d1 = cp.array(X[:, 0])
d2 = cp.array(X[:, 1])
blocks = ...
grid = ...
args = (cp.array([d1.data.ptr, d2.data.ptr]), 2, len(X), output, ...)
shared_mem = ...
kern((grid, 1, 1), (blocks, 1, 1), args=args, shared_mem=shared_mem)
cp.cuda.Stream.null.synchronize()
As you can probably spot, my issue is in the args = ... row.
The code does not raise errors, but the data in the output vector is definitely wrong.
Am I passing the data properly? Is there a better way?
Could you try creating a minimum reproducer? I see no problem passing a double-pointer argument.
import cupy as cp
import sys
code = '''
extern "C" __global__ void myKernel(float **dataset, int n, int* output) {
for (int i = 0; i < n; ++i) {
printf("[");
for (int j = 0; j < 10; ++j) {
printf("%f, ", dataset[i][j]);
}
printf("]\\n");
}
}
'''
kern = cp.RawKernel(code, 'myKernel')
d0 = cp.arange(10, dtype=cp.float32)
d1 = cp.arange(10, dtype=cp.float32) * 2
dataset = cp.array([d0.data.ptr, d1.data.ptr])
output = cp.empty(10, dtype=cp.int32)
def run():
print('--expected--')
print(d0)
print(d1)
print('--actual--')
sys.stdout.flush()
kern((1, 1, 1), (1, 1, 1), args=(dataset, len(dataset), output))
cp.cuda.Stream.null.synchronize()
print('--end--')
print()
run()
Output:
--expected--
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
[ 0. 2. 4. 6. 8. 10. 12. 14. 16. 18.]
--actual--
[0.000000, 1.000000, 2.000000, 3.000000, 4.000000, 5.000000, 6.000000, 7.000000, 8.000000, 9.000000, ]
[0.000000, 2.000000, 4.000000, 6.000000, 8.000000, 10.000000, 12.000000, 14.000000, 16.000000, 18.000000, ]
--end--

How to relate kernel input data structure in CUDA kernel function with parameter input in pycuda

I am writing a cuda kernel to convert rgba image to gray scale image in pycuda, here is the PyCUDA code:
import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
kernel = SourceModule("""
#include <stdio.h>
__global__ void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
int y = threadIdx.y+ blockIdx.y* blockDim.y;
int x = threadIdx.x+ blockIdx.x* blockDim.x;
if (y < numCols && x < numRows) {
int index = numRows*y +x;
uchar4 color = rgbaImage[index];
unsigned char grey = (unsigned char)(0.299f*color.x+ 0.587f*color.y +
0.114f*color.z);
greyImage[index] = grey;
}
}
""")
However, the problem is how to relate uchar4* to numpy array. I know can modify my kernel function to accept int* or float*, and make it work. But I just wonder how to make the above kernel function to work in pycuda.
Below is host code.
def gpu_rgb2gray(image):
shape = image.shape
n_rows, n_cols, _ = np.array(shape, dtype=np.int)
image_gray = np.empty((n_rows, n_cols), dtype= np.int)
## HERE is confusing part, how to rearrange image to match unchar4* ??
image = image.reshape(1, -1, 4)
# Get kernel function
rgba2gray = kernel.get_function("rgba_to_greyscale")
# Define block, grid and compute
blockDim = (32, 32, 1) # 1024 threads in total
dx, mx = divmod(shape[1], blockDim[0])
dy, my = divmod(shape[0], blockDim[1])
gridDim = ((dx + (mx>0)), (dy + (my>0)), 1)
# Kernel function
# HERE doesn't work because of mismatch
rgba2gray (
cuda.In(image), cuda.Out(image_gray), n_rows, n_cols,
block=blockDim, grid=gridDim)
return image_gray
Anyone have any ideas? Thanks!
The gpuarray class has native support for CUDA's built in vector types (including uchar4).
So you can create as gpuarray instance with the correct dtype for the kernel, and copy the host image to that gpuarray using buffers, then use the gpuarray as the kernel input argument. As an example (and if I understood your code correctly), something like this should probably work:
import pycuda.gpuarray as gpuarray
....
def gpu_rgb2gray(image):
shape = image.shape
image_rgb = gpuarray.empty(shape, dtype=gpuarray.vec.uchar4)
cuda.memcpy_htod(image_rgb.gpudata, image.data)
image_gray = gpuarray.empty(shape, dtype=np.uint8)
# Get kernel function
rgba2gray = kernel.get_function("rgba_to_greyscale")
# Define block, grid and compute
blockDim = (32, 32, 1) # 1024 threads in total
dx, mx = divmod(shape[1], blockDim[0])
dy, my = divmod(shape[0], blockDim[1])
gridDim = ((dx + (mx>0)), (dy + (my>0)), 1)
rgba2gray ( image_rgb, image_gray, np.int32(shape[0]), np.int32(shape[1]), block=blockDim, grid=gridDim)
img_gray = np.array(image_gray.get(), dtype=np.int)
return img_gray
this would take an image of 32 bit unsigned integers and copy them to an array of uchar4 on the GPU and then upcast the resulting array of uchar back to integers on the device.

Cython Typing List of Strings

I'm trying to use cython to improve the performance of a loop, but I'm running
into some issues declaring the types of the inputs.
How do I include a field in my typed struct which is a string that can be
either 'front' or 'back'
I have a np.recarray that looks like the following (note the length of the
recarray is unknown as compile time)
import numpy as np
weights = np.recarray(4, dtype=[('a', np.int64), ('b', np.str_, 5), ('c', np.float64)])
weights[0] = (0, "front", 0.5)
weights[1] = (0, "back", 0.5)
weights[2] = (1, "front", 1.0)
weights[3] = (1, "back", 0.0)
as well as inputs of a list of strings and a pandas.Timestamp
import pandas as pd
ts = pd.Timestamp("2015-01-01")
contracts = ["CLX16", "CLZ16"]
I am trying to cythonize the following loop
def ploop(weights, contracts, timestamp):
cwts = []
for gen_num, position, weighting in weights:
if weighting != 0:
if position == "front":
cntrct_idx = gen_num
elif position == "back":
cntrct_idx = gen_num + 1
else:
raise ValueError("transition.columns must contain "
"'front' or 'back'")
cwts.append((gen_num, contracts[cntrct_idx], weighting, timestamp))
return cwts
My attempt involved typing the weights input as a struct in cython,
in a file struct_test.pyx as follows
import numpy as np
cimport numpy as np
cdef packed struct tstruct:
np.int64_t gen_num
char[5] position
np.float64_t weighting
def cloop(tstruct[:] weights_array, contracts, timestamp):
cdef tstruct weights
cdef int i
cdef int cntrct_idx
cwts = []
for k in xrange(len(weights_array)):
w = weights_array[k]
if w.weighting != 0:
if w.position == "front":
cntrct_idx = w.gen_num
elif w.position == "back":
cntrct_idx = w.gen_num + 1
else:
raise ValueError("transition.columns must contain "
"'front' or 'back'")
cwts.append((w.gen_num, contracts[cntrct_idx], w.weighting,
timestamp))
return cwts
But I am receiving runtime errors, which I believe are related to the
char[5] position.
import pyximport
pyximport.install()
import struct_test
struct_test.cloop(weights, contracts, ts)
ValueError: Does not understand character buffer dtype format string ('w')
In addition I am a bit unclear how I would go about typing contracts as well
as timestamp.
Your ploop (without the timestamp variable) produces:
In [226]: ploop(weights, contracts)
Out[226]: [(0, 'CLX16', 0.5), (0, 'CLZ16', 0.5), (1, 'CLZ16', 1.0)]
Equivalent function without a loop:
def ploopless(weights, contracts):
arr_contracts = np.array(contracts) # to allow array indexing
wgts1 = weights[weights['c']!=0]
mask = wgts1['b']=='front'
wgts1['b'][mask] = arr_contracts[wgts1['a'][mask]]
mask = wgts1['b']=='back'
wgts1['b'][mask] = arr_contracts[wgts1['a'][mask]+1]
return wgts1.tolist()
In [250]: ploopless(weights, contracts)
Out[250]: [(0, 'CLX16', 0.5), (0, 'CLZ16', 0.5), (1, 'CLZ16', 1.0)]
I'm taking advantage of the fact that returned list of tuples has same (int, str, int) layout as the input weight array. So I'm just making a copy of weights and replacing selected values of the b field.
Note that I use the field selection index before the mask one. The boolean mask produces a copy, so we have to careful about indexing order.
I'm guessing that loop-less array version will be competitive in time with the cloop (on realistic arrays). The string and list operations in cloop probably limit its speedup.

Switch on argument type

Using Open SCAD, I have a module that, like cube(), has a size parameter that can be a single value or a vector of three values. Ultimately, I want a vector of three values.
If the caller passes a single value, I'd like all three values of the vector to be the same. I don't see anything in the language documentation about detecting the type of an argument. So I came up with this hack:
module my_cubelike_thing(size=1) {
dimensions = concat(size, size, size);
width = dimensions[0];
length = dimensions[1];
height = dimensions[2];
// ... use width, length, and height ...
}
When size is a single value, the result of the concat is exactly what I want: three copies of the value.
When size is a three-value vector, the result of the concat is nine-value vector, and my code just ignores the last six values.
It works but only because what I want in the single value case is to replicate the value. Is there a general way to switch on the argument type and do different things depending on that type?
If type of size only can be single value or a vector with 3 values, the type can helpwise be found by the special value undef:
a = [3,5,8];
// a = 5;
if (a[0] == undef) {
dimensions = concat(a, a, a);
// do something
cube(size=dimensions,center=false);
}
else {
dimensions = a;
// do something
cube(size=dimensions,center=false);
}
But assignments are only valid in the scope in which they are defined , documnetation of openscad.
So in each subtree much code is needed and i would prefere to validate the type of size in an external script (e.g. python3) and write the openscad-code with the assignment of variables to a file, which can be included in the openscad-file, here my short test-code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
# size = 20
size = [20,15,10]
if type(size) == int:
dimensions = [size, size, size]
elif type(size) == list:
dimensions = size
else:
# if other types possible
pass
with open('variablen.scad', 'w') as wObj:
for i, v in enumerate(['l', 'w', 'h']):
wObj.write('{} = {};\n'.format(v, dimensions[i]))
os.system('openscad ./typeDef.scad')
content of variablen.scad:
l = 20;
w = 15;
h = 10;
and typeDef.scad can look like this
include <./variablen.scad>;
module my_cubelike_thing() {
linear_extrude(height=h, center=false) square(l, w);
}
my_cubelike_thing();

PyOpenCL reduction Kernel on each pixel of image as array instead of each byte (RGB mode, 24 bits )

I'm trying to calculate the average Luminance of an RGB image. To do this, I find the luminance of each pixel i.e.
L(r,g,b) = X*r + Y*g + Z*b (some linear combination).
And then find the average by summing up luminance of all pixels and dividing by width*height.
To speed this up, I'm using pyopencl.reduction.ReductionKernel
The array I pass to it is a Single Dimension Numpy Array so it works just like the example given.
import Image
import numpy as np
im = Image.open('image_00000001.bmp')
data = np.asarray(im).reshape(-1) # so data is a single dimension list
# data.dtype is uint8, data.shape is (w*h*3, )
I want to incorporate the following code from the example into it . i.e. I would make changes to datatype and the type of arrays I'm passing. This is the example:
a = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
b = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
krnl = ReductionKernel(ctx, numpy.float32, neutral="0",
reduce_expr="a+b", map_expr="x[i]*y[i]",
arguments="__global float *x, __global float *y")
my_dot_prod = krnl(a, b).get()
Except, my map_expr will work on each pixel and convert each pixel to its luminance value.
And reduce expr remains the same.
The problem is, it works on each element in the array, and I need it to work on each pixel which is 3 consecutive elements at a time (RGB ).
One solution is to have three different arrays, one for R, one for G and one for B ,which would work, but is there another way ?
Edit: I changed the program to illustrate the char4 usage instead of float4:
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
deviceID = 0
platformID = 0
workGroup=(1,1)
N = 10
testData = np.zeros(N, dtype=cl_array.vec.char4)
dev = cl.get_platforms()[platformID].get_devices()[deviceID]
ctx = cl.Context([dev])
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
Data_In = cl.Buffer(ctx, mf.READ_WRITE, testData.nbytes)
prg = cl.Program(ctx, """
__kernel void Pack_Cmplx( __global char4* Data_In, int N)
{
int gid = get_global_id(0);
//Data_In[gid] = 1; // This would change all components to one
Data_In[gid].x = 1; // changing single component
Data_In[gid].y = 2;
Data_In[gid].z = 3;
Data_In[gid].w = 4;
}
""").build()
prg.Pack_Cmplx(queue, (N,1), workGroup, Data_In, np.int32(N))
cl.enqueue_copy(queue, testData, Data_In)
print testData
I hope it helps.