bsearch in Cython - numpy

I'm learning Cython's libc.stdlib.bsearch, trying to use it to get the index of a value in a sorted array. The example is adapted from this question, with modifications:
## test_bsearch.pyx
cimport cython
from libc.stdlib cimport bsearch

cdef int comp_fun(const void *a, const void *b) nogil:
    cdef int a_v = (<int*>a)[0]
    cdef int b_v = (<int*>b)[0]
    if a_v < b_v:
        return -1
    elif a_v > b_v:
        return 1
    else:
        return 0

def bsearch_c(int[::1] t, int v):
    cdef int *p = <int*> bsearch(&v, &t[0], t.shape[0], sizeof(int), &comp_fun)
    cdef int j = <int> p
    if p != NULL:
        return j
    else:
        return -1
I then created a setup.py:
from distutils.core import setup
from Cython.Build import cythonize
import numpy as np

setup(
    ext_modules=cythonize(
        ["test_bsearch.pyx"],
        compiler_directives={'language_level': "3"}
    ),
    include_dirs=[
        np.get_include()
    ]
)
I compiled the code on Windows 10 at the command prompt: python setup.py build_ext -i. But running it as follows gave a strange result:
>>> from test_bsearch import bsearch_c
>>> import numpy as np
>>> x = np.arange(20, dtype=np.int32)
>>> bsearch_c(x, 5) # got 610183044
I know nothing about C++, so I can't figure out what's wrong with the above implementation. How can I correct it?

cdef int j = <int> p
This casts the pointer itself to an int, which is why you see a garbage value like 610183044: it's the address, not the element. You want
cdef int j = p[0]
which dereferences the pointer. Note that the dereference must happen only after the NULL check, since bsearch returns NULL when the value is not found.
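Putting it together, a corrected bsearch_c might look like this (a minimal sketch; returning the position via pointer arithmetic is an addition beyond the one-line fix above, since bsearch hands back a pointer to the matching element, not its index):

def bsearch_c(int[::1] t, int v):
    cdef int *p = <int*> bsearch(&v, &t[0], t.shape[0], sizeof(int), &comp_fun)
    if p == NULL:
        return -1
    return <int>(p - &t[0])  # offset from the start of t gives the index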

Related

Cython passing int numpy array to C++

First, I know this question appears similar to this one, but they are different. I'm struggling to pass an int (int32) numpy array to C++ via Cython without copying. The files:
doit.cpp:
#include "doit.h"
void run(int *x) {}
doit.h:
#ifndef _DOIT_H_
#define _DOIT_H_
void run(int *);
#endif
q.pyx:
cimport numpy as np
import numpy as np

cdef extern from "doit.h":
    void run(int* X)

def pyrun(np.ndarray[np.int_t, ndim=1] X):
    X = np.ascontiguousarray(X)
    run(&X[0])
I compile with Cython. The error is:
Error compiling Cython file:
------------------------------------------------------------
...
cdef extern from "doit.h":
    void run(int* X)

def pyrun(np.ndarray[np.int_t, ndim=1] X):
    X = np.ascontiguousarray(X)
    run(&X[0])
        ^
------------------------------------------------------------
py_cpp/q.pyx:9:8: Cannot assign type 'int_t *' to 'int *'
However, if I replace all occurrences of int with double (e.g. int *x with double *x, int_t with double_t), then all the errors are gone.
How can I solve the problem? Thanks in advance.
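One way out, sketched under the assumption that the C++ side really wants a plain int buffer: np.int_t has a platform-dependent width that need not match C's int, so type the buffer with the C int itself (via ctypes.c_int) rather than np.int_t:

# q.pyx -- a sketch: type the buffer as C int so &X_c[0] has type int*
cimport numpy as np
import numpy as np
import ctypes

cdef extern from "doit.h":
    void run(int* X)

def pyrun(X):
    cdef np.ndarray[int, ndim=1, mode="c"] X_c
    X_c = np.ascontiguousarray(X, dtype=ctypes.c_int)  # width matches the platform's C int
    run(&X_c[0])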

How to invoke a kernel which has a double pointer as argument?

I'm trying to invoke a custom kernel with a double pointer as argument using CuPy.
Given a dataset of size n of d-dimensional points, I allocate an array of n floats for each dimension.
Then I put all the pointers to those arrays into another array. That's why I'm dealing with a double pointer.
My custom kernel has a signature like:
__global__ void myKernel(float **dataset, int n, int d, int* output, ...)
In native CUDA code (compiled with nvcc) my code works as expected.
My python attempt to invoke the kernel looks like:
import cupy as cp
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1_000, n_features=2, cluster_std=.5,
                  center_box=(-10.0, 10.0), shuffle=True, random_state=None)

with open('path', 'r') as f:
    code = f.read()

output = cp.empty(len(X), dtype=cp.int32)
kern = cp.RawKernel(code, 'myKernel')
d1 = cp.array(X[:, 0])
d2 = cp.array(X[:, 1])
blocks = ...
grid = ...
args = (cp.array([d1.data.ptr, d2.data.ptr]), 2, len(X), output, ...)
shared_mem = ...
kern((grid, 1, 1), (blocks, 1, 1), args=args, shared_mem=shared_mem)
cp.cuda.Stream.null.synchronize()
As you can probably spot, my issue is in the args = ... line.
The code does not raise errors, but the data in the output vector is definitely wrong.
Am I passing the data properly? Is there a better way?
Could you try creating a minimal reproducer? I see no problem passing a double-pointer argument.
import cupy as cp
import sys

code = '''
extern "C" __global__ void myKernel(float **dataset, int n, int* output) {
    for (int i = 0; i < n; ++i) {
        printf("[");
        for (int j = 0; j < 10; ++j) {
            printf("%f, ", dataset[i][j]);
        }
        printf("]\\n");
    }
}
'''

kern = cp.RawKernel(code, 'myKernel')
d0 = cp.arange(10, dtype=cp.float32)
d1 = cp.arange(10, dtype=cp.float32) * 2
dataset = cp.array([d0.data.ptr, d1.data.ptr])
output = cp.empty(10, dtype=cp.int32)

def run():
    print('--expected--')
    print(d0)
    print(d1)
    print('--actual--')
    sys.stdout.flush()
    kern((1, 1, 1), (1, 1, 1), args=(dataset, len(dataset), output))
    cp.cuda.Stream.null.synchronize()
    print('--end--')
    print()

run()
Output:
--expected--
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
[ 0. 2. 4. 6. 8. 10. 12. 14. 16. 18.]
--actual--
[0.000000, 1.000000, 2.000000, 3.000000, 4.000000, 5.000000, 6.000000, 7.000000, 8.000000, 9.000000, ]
[0.000000, 2.000000, 4.000000, 6.000000, 8.000000, 10.000000, 12.000000, 14.000000, 16.000000, 18.000000, ]
--end--
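One detail worth double-checking in the original snippet (my observation, not something the reproducer above tests): make_blobs returns float64 data and cp.array(X[:, 0]) keeps that dtype, while the kernel reads through float*. An explicit cast rules out that mismatch:

# Cast to float32 so the device columns match the kernel's float* elements
d1 = cp.array(X[:, 0], dtype=cp.float32)
d2 = cp.array(X[:, 1], dtype=cp.float32)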

How to relate kernel input data structure in CUDA kernel function with parameter input in pycuda

I am writing a CUDA kernel to convert an RGBA image to a grayscale image in PyCUDA. Here is the PyCUDA code:
import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

kernel = SourceModule("""
#include <stdio.h>
__global__ void rgba_to_greyscale(const uchar4* const rgbaImage,
                                  unsigned char* const greyImage,
                                  int numRows, int numCols)
{
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    if (y < numCols && x < numRows) {
        int index = numRows * y + x;
        uchar4 color = rgbaImage[index];
        unsigned char grey = (unsigned char)(0.299f*color.x + 0.587f*color.y +
                                             0.114f*color.z);
        greyImage[index] = grey;
    }
}
""")
However, the problem is how to relate uchar4* to a numpy array. I know I can modify my kernel function to accept int* or float* and make it work, but I just wonder how to make the above kernel function work in PyCUDA.
Below is the host code.
def gpu_rgb2gray(image):
    shape = image.shape
    n_rows, n_cols, _ = np.array(shape, dtype=np.int)
    image_gray = np.empty((n_rows, n_cols), dtype=np.int)
    ## HERE is the confusing part: how to rearrange image to match uchar4*?
    image = image.reshape(1, -1, 4)
    # Get kernel function
    rgba2gray = kernel.get_function("rgba_to_greyscale")
    # Define block, grid and compute
    blockDim = (32, 32, 1)  # 1024 threads in total
    dx, mx = divmod(shape[1], blockDim[0])
    dy, my = divmod(shape[0], blockDim[1])
    gridDim = ((dx + (mx > 0)), (dy + (my > 0)), 1)
    # Kernel call
    # HERE doesn't work because of the mismatch
    rgba2gray(
        cuda.In(image), cuda.Out(image_gray), n_rows, n_cols,
        block=blockDim, grid=gridDim)
    return image_gray
Anyone have any ideas? Thanks!
The gpuarray class has native support for CUDA's built-in vector types (including uchar4).
So you can create a gpuarray instance with the correct dtype for the kernel, copy the host image to that gpuarray using buffers, and then use the gpuarray as the kernel input argument. As an example (and if I understood your code correctly), something like this should probably work:
import pycuda.gpuarray as gpuarray
....

def gpu_rgb2gray(image):
    shape = image.shape
    image_rgb = gpuarray.empty(shape, dtype=gpuarray.vec.uchar4)
    cuda.memcpy_htod(image_rgb.gpudata, image.data)
    image_gray = gpuarray.empty(shape, dtype=np.uint8)
    # Get kernel function
    rgba2gray = kernel.get_function("rgba_to_greyscale")
    # Define block, grid and compute
    blockDim = (32, 32, 1)  # 1024 threads in total
    dx, mx = divmod(shape[1], blockDim[0])
    dy, my = divmod(shape[0], blockDim[1])
    gridDim = ((dx + (mx > 0)), (dy + (my > 0)), 1)
    rgba2gray(image_rgb, image_gray, np.int32(shape[0]), np.int32(shape[1]),
              block=blockDim, grid=gridDim)
    img_gray = np.array(image_gray.get(), dtype=np.int)
    return img_gray
This would take an image of 32-bit unsigned integers, copy them to an array of uchar4 on the GPU, and then upcast the resulting array of uchar back to integers on the host.
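For reference, gpuarray.vec.uchar4 is simply a structured NumPy dtype with four unsigned-byte fields x, y, z, w, so an RGBA uint8 host image can also be viewed with that dtype up front without copying (a sketch; the image dimensions are made up):

import numpy as np
import pycuda.gpuarray as gpuarray

img = np.zeros((480, 640, 4), dtype=np.uint8)  # hypothetical RGBA image
# Viewing with the 4-byte structured dtype collapses the channel axis
img_vec = img.view(gpuarray.vec.uchar4).reshape(480, 640)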

Making my cython code more efficient

I've written a Python program which I'm trying to cythonize.
Is there any suggestion how to make the for loop more efficient, as it is taking 99% of the time?
This is the for loop:
for i in range(l):
    b1[i] = np.nanargmin(locator[i,:])  # Closest point
    locator[i, b1[i]] = NAN             # Do not consider closest point
    b2[i] = np.nanargmin(locator[i,:])  # 2nd closest point
    Adjacents[i,0] = np.array((Existed_Pips[b1[i]]), dtype=np.double)
    Adjacents[i,1] = np.array((Existed_Pips[b2[i]]), dtype=np.double)
This is the rest of the code:
import numpy as np
cimport numpy as np
from libc.math cimport NAN  #, isnan

def PIPs(np.ndarray[np.double_t, ndim=1, mode='c'] ys, unsigned int nofPIPs, unsigned int typeofdist):
    cdef:
        unsigned int currentstate, j, i
        np.ndarray[np.double_t, ndim=1, mode="c"] D
        np.ndarray[np.int64_t, ndim=1, mode="c"] Existed_Pips
        np.ndarray[np.int_t, ndim=1, mode="c"] xs
        np.ndarray[np.double_t, ndim=2] Adjacents, locator, Adjy, Adjx, Raw_Fire_PIPs, Raw_Fem_PIPs
        np.ndarray[np.int_t, ndim=2, mode="c"] PIP_points, b1, b2
    cdef unsigned int l = len(ys)
    xs = np.arange(0, l, dtype=np.int)          # Column vector with xs
    PIP_points = np.zeros((l,1), dtype=np.int)  # Binary indexation
    PIP_points[0] = 1   # Ones indicate the PIP points. The first two PIPs are the first and the last observation.
    PIP_points[-1] = 1
    Adjacents = np.zeros((l,2), dtype=np.double)
    currentstate = 2    # Initial PIPs
    while currentstate <= nofPIPs:  # for each PIP in range(nofPIPs)
        Existed_Pips = np.flatnonzero(PIP_points)
        currentstate = len(Existed_Pips)
        locator = np.full((l, currentstate), NAN, dtype=np.double)  #np.int*
        for j in range(currentstate):
            locator[:,j] = np.absolute(xs - Existed_Pips[j])
        b1 = np.zeros((l,1), dtype=np.int)
        b2 = np.zeros((l,1), dtype=np.int)
        for i in range(l):
            b1[i] = np.nanargmin(locator[i,:])  # Closest point
            locator[i, b1[i]] = NAN             # Do not consider closest point
            b2[i] = np.nanargmin(locator[i,:])  # 2nd closest point
            Adjacents[i,0] = np.array((Existed_Pips[b1[i]]), dtype=np.double)
            Adjacents[i,1] = np.array((Existed_Pips[b2[i]]), dtype=np.double)
        ## Calculate distance
        Adjx = Adjacents
        Adjy = np.array([ys[np.array(Adjacents[:,0], dtype=np.int)], ys[np.array(Adjacents[:,1], dtype=np.int)]]).transpose()
        Adjx[Existed_Pips,:] = NAN  # Existing PIPs are not candidates for a new PIP.
        Adjy[Existed_Pips,:] = NAN
        if typeofdist == 1:  # Euclidean distance
            ##[D] = EDist(ys,xs,Adjx,Adjy)
            ED = np.power(np.power((Adjx[:,1]-xs),2) + np.power((Adjy[:,1]-ys),2),(0.5)) + np.power(np.power((Adjx[:,0]-xs),2) + np.power((Adjy[:,0]-ys),2),(0.5))
            EDmax = np.nanargmax(ED)
            PIP_points[EDmax] = 1
            currentstate = currentstate + 1
    return np.array([Existed_Pips, ys[Existed_Pips]]).transpose()
A couple of suggestions:
Take the calls to np.nanargmin out of the loop (use the axis parameter so you operate on the whole array at once); this reduces the number of Python function calls you have to make:
b1 = np.nanargmin(locator, axis=1)
locator[np.arange(locator.shape[0]), b1] = np.nan
b2 = np.nanargmin(locator, axis=1)
Your assignment to Adjacents is odd - you seem to be creating a length-1 array for the right-hand side first. Instead just do
Adjacents[i,0] = Existed_Pips[b1[i]]
# ...
However, in this case you can also take both lines outside the loop, eliminating it entirely:
Adjacents = np.vstack((Existed_Pips[b1], Existed_Pips[b2])).T
All of this relies on numpy, rather than Cython, for the speed-up, but it probably beats your version.
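As a quick sanity check on a toy locator, the vectorized calls produce the same b1/b2 as the per-row loop (a small self-contained sketch):

import numpy as np

locator = np.array([[3., 1., 2.],
                    [0., 5., 4.]])
b1 = np.nanargmin(locator, axis=1)                 # [1, 0]: row-wise minima
locator[np.arange(locator.shape[0]), b1] = np.nan  # mask the minima out
b2 = np.nanargmin(locator, axis=1)                 # [2, 2]: second-closest
print(b1, b2)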

Passing numpy integer array to c code

I'm trying to write Cython code to dump a dense feature matrix, target vector pair to libsvm format faster than sklearn's built-in code. I get a compilation error complaining about a type issue when passing the target vector (a numpy array of ints) to the relevant C function.
Here's the code:
import numpy as np
cimport numpy as np
cimport cython

cdef extern from "cdump.h":
    int filedump(double features[], int numexemplars, int numfeats, int target[], char* outfname)

@cython.boundscheck(False)
@cython.wraparound(False)
def fastdumpdense_libsvmformat(np.ndarray[np.double_t, ndim=2] X, y, outfname):
    if X.shape[0] != len(y):
        raise ValueError("X and y need to have the same number of points")
    cdef int numexemplars = X.shape[0]
    cdef int numfeats = X.shape[1]
    cdef bytes py_bytes = outfname.encode()
    cdef char* outfnamestr = py_bytes
    cdef np.ndarray[np.double_t, ndim=2, mode="c"] X_c
    cdef np.ndarray[np.int_t, ndim=1, mode="c"] y_c
    X_c = np.ascontiguousarray(X, dtype=np.double)
    y_c = np.ascontiguousarray(y, dtype=np.int)
    retval = filedump(&X_c[0,0], numexemplars, numfeats, &y_c[0], outfnamestr)
    return retval
When I attempt to compile this code using distutils, I get the error
cythoning fastdump_svm.pyx to fastdump_svm.cpp
Error compiling Cython file:
------------------------------------------------------------ ...
    cdef np.ndarray[np.double_t, ndim=2, mode="c"] X_c
    cdef np.ndarray[np.int_t, ndim=1, mode="c"] y_c
    X_c = np.ascontiguousarray(X, dtype=np.double)
    y_c = np.ascontiguousarray(y, dtype=np.int)
    retval = filedump( &X_c[0,0], numexemplars, numfeats, &y_c[0], outfnamestr)
                                                          ^
------------------------------------------------------------
fastdump_svm.pyx:24:58: Cannot assign type 'int_t *' to 'int *'
Any idea how to fix this error? I originally was following the paradigm of passing y_c.data, which works, but this is apparently not the recommended way.
You can also use dtype=np.dtype("i") when creating the numpy array, so that it matches the C int on your machine:
cdef int [:] y_c
y_c = np.ascontiguousarray(y, dtype=np.dtype("i"))
The problem is that numpy.int_t is not the same as int; you can easily check this by having your program print sizeof(numpy.int_t) and sizeof(int).
int is a C int, defined by the C standard as being at least 16 bits, but it's 32 bits on my machine. numpy.int_t is usually 32 or 64 bits depending on whether you're using a 32- or 64-bit version of numpy, but of course there are exceptions (probably for Windows users). If you want to know which numpy dtype matches your C int, you can do np.dtype(ctypes.c_int).
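For instance, a quick way to run that comparison (a throwaway sketch; the module name is arbitrary):

# check_sizes.pyx -- compare the C-level widths of numpy.int_t and int
cimport numpy as np

def check():
    # sizeof() works in Cython on both the ctypedef and the plain C type
    print(sizeof(np.int_t), sizeof(int))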
So to pass your numpy array to C code you can do:
import ctypes
cdef np.ndarray[int, ndim=1, mode="c"] y_c
y_c = np.ascontiguousarray(y, dtype=ctypes.c_int)
retval = filedump( &X_c[0,0], numexemplars, numfeats, &y_c[0], outfnamestr)