Why does writing to memory take longer than reading in GPU? - optimization

Here is part of my code where I update some values and write it to another part of my memory
__kernel void update(
__global float4 *f,
__global float4 *p,
const unsigned int n,
__local float4 *sharedP){
float4 val = (float4)(0.f,0.f,0.f,0.f);
size_t lID = get_local_id(0);
size_t gID = get_global_id(0);
if(gID < n){
float4 p1 = p[gID];
for(size_t i = 0; i < n; ++i){
if(i!=gID){
float4 p2 = p1 - p[i];
//Some other calculations to finally update 'val'
}
}
f[gID] = val;
}
}
Here each thread reads from the global memory n+1 times, but writes back only once.(My initial thought was that, the bottleneck was due repeated call to the global memory, hence I had made some changes to copy a block of data into the shared memory, swap it and other things. Even this did not improve the performance a lot). However later I found that the bottleneck is due to f[gID] = val;. For example when the kernel is run 200 times with n = 8192, it takes 3.5s; but if I comment out f[gID] = val; it takes only 0.05s.
Code for other method
__kernel void update(
__global float4 *f,
__global float4 *p,
const unsigned int n,
__local float4 *sharedP){
float4 val = (float4)(0.f,0.f,0.f,0.f);
size_t lID = get_local_id(0);
size_t gID = get_global_id(0);
if(gID < n){
sharedP[lID] = p[gID];
barrier(CLK_LOCAL_MEM_FENCE);
float4 p1 = sharedP[lID];
//To prevent i!=gID of other method
for(size_t i = 0; i < get_local_size(0);; ++i){
if(lID!=i){
float4 p2 = p1 - sharedP[i];
//Some other calculations to update 'val' at 0
}
}
for(size_t block = 1; block < (size_t)(get_global_size(0)/get_local_size(0)); ++block){
size_t idx = lID + (get_group_id(0) + block ) * get_local_size(0);
if( idx >= n)
idx -= n;
sharedP[lID] = p[idx];
barrier(CLK_LOCAL_MEM_FENCE);
for(size_t i = 0; i < get_local_size(0); ++i){
float4 p = p1 - sharedP[i];
//Some other calculations to update 'val'
}
f[gID] = val;
}
}
}
So how is writing to memory causing it to slow down so much and why is using shared memory not providing a significant performance increase?

Related

Threads indexing out of bounds in CUDA kernel

I am running a CUDA kernel which seems to be indexing out of bounds and I can not figure out why. I get error 8 write-of-size in cuda-memcheck.
I have tried to change the number of blocks and the number of threads in each block as well as only running a fraction of all iterations needed. Here is some usefull information as well as a replicable example which gives the error:
blockSize: 128
numBlocks: 512
Nvidia GTX 970
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector>
#include <iterator>
#include <cuda_profiler_api.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <stdio.h>
#include <fstream>
__host__
int NchooseK(const int &N, const int &K)
{
int result = 1;
for (int i = 1; i <= K; i++)
{
result *= N - (K - i);
result /= i;
}
return result;
}
__host__
inline int get_flatten_size(const unsigned int N){
int sum = 0;
for(int i=1; i<=N ; i++){
sum +=i*NchooseK(N,i);
}
return sum;
}
__host__
std::vector<int> comb(const int &N, const int &K, const int &length)
//void comb(int N, int K, int length)
{
int k;
std::vector<int> vec(K);
std::vector<int> flatten_vec(0);
std::string bitmask(K, 1); // K leading 1's
bitmask.resize(N, 0); // N-K trailing 0's
for (int j = 0; j < length; j++) {
k = 0;
for (int i = 0; i < N; ++i) // [0..N-1] integers
{
if (bitmask[i]) {
//std::cout << i << " ";
vec[k] = i;
k++;
}
//std::cout << std::endl;
}
std::prev_permutation(bitmask.begin(), bitmask.end());
flatten_vec.insert(flatten_vec.end(), vec.begin(),vec.end());
}
return flatten_vec;
}
__host__
void get_matrix_indices(const unsigned int N, int *sub_col, int *sub_size, int *cumulative_size)
{
int size, itterator = 0;
cumulative_size[0] = 0;
std::vector<int> size_i_columns;
std::vector<int> all_columns(0);
for(int i=1; i<=N; i++){
size = NchooseK(N,i);
size_i_columns = comb(N,i,size);
for(int j=0; j<size; j++){
sub_size[itterator]=i;
cumulative_size[itterator+1]=cumulative_size[itterator]+i;
itterator++;
}
all_columns.insert(all_columns.end(),size_i_columns.begin(),size_i_columns.end());
}
//sub_col = &all_columns[0];
for(int i = 0; i < all_columns.size(); i++) sub_col[i] = all_columns[i];
}
__global__
void comb_ols(const unsigned int M, const unsigned int N, int* sub_col, int *sub_size, int* cumulative_size, const unsigned int numberOfCalculations, const unsigned int max_size){
int size;
int start_index;
int index = blockIdx.x*blockDim.x+threadIdx.x;
int stride = blockDim.x*gridDim.x;
double *sub_matrix = new double[M*(1+max_size)];
for(int i = index; i < numberOfCalculations; i+=stride){
size = sub_size[i];
start_index = cumulative_size[i];
for(int j = 0; j < size; j++){
for(int k = 0; k<M; k++){
sub_matrix[k] = 1;
}
}
}
delete [] sub_matrix;
}
And then we the main function:
int main()
{
int N = 17;
int M = 263;
const unsigned int regressors = N-1;
const unsigned int numberOfCalculations = (int) (exp2((double) regressors) - 1);
const unsigned int size_sub_col = get_flatten_size(regressors);
int blockSize =128;
int numBlocks = (numberOfCalculations + blockSize-1)/blockSize;
std::cout << "\nblockSize :" << blockSize;
std::cout << "\nnumBlocks :" << numBlocks;
std::cout << "\nblockSize*numBlocks :" << blockSize*numBlocks;
std::cout << "\nregressors :" << regressors;
std::cout << "\nNumberOfCalculations :" << numberOfCalculations;
std::cout << "\nsize_sub_col :" << size_sub_col << '\n' ;
int *sub_size, *cumulative_size, *sub_columns;
cudaMallocManaged(&sub_size, numberOfCalculations*sizeof(int));
cudaMallocManaged(&cumulative_size, (numberOfCalculations+1)*sizeof(int));
cudaMallocManaged(&sub_columns, size_sub_col*sizeof(int));
get_matrix_indices(regressors,sub_columns, sub_size, cumulative_size);
const unsigned int max_size = N*M;
cudaProfilerStart();
comb_ols<<<numBlocks, blockSize>>>(M,N,sub_columns, sub_size, cumulative_size, numberOfCalculations, max_size);
cudaProfilerStop();
cudaDeviceSynchronize();
cudaFree(sub_size);
cudaFree(cumulative_size);
cudaFree(sub_columns);
return 0;
}
I fail to see why the threads would try to access illegal memory space. The way I understood is that the matrix sub_matrix will be initilized on each thread once and then the parallel for loop happens. Thus should each thread have the necessary memory space. Am I allocating too much memory on the GPU? How is "new sub_matrix" handled here?
If I read your code correctly, each thread is attempting to allocate M * (1 + M*N) doubles, which is 263 * ( 1 + 263*17) = ‭1,176,136‬ doubles, or 8.97Mb of heap memory per thread. You launch 128 * 512 threads. That would mean you require 588Gb of heap space for the kernel to run successfully.
Clearly your GPU lacks that amount of memory and the out of bounds memory access comes from failures in the new call (which you can check for, BTW).
Might I suggest that something in the size calculations for the heap memory you require is wrong. Otherwise you have an extremely unrealistic problem for the GPU and will require some other approach.
Note that even if you manage to redesign things to limit the code to a feasible malloc heap memory size, you will still need, in all likelihood, to resize the malloc heap to a suitable size before running the kernel. The cudaDeviceSetLimit API can be used for this.

Confusion about my QuickSort algorithm & Mergesort algorithm

I am currently conducting empirical studies to evaluate the run-time complexities of the quicksort, and mergesort algorithms. To do this I run a random number generator that stores whatever amount of numbers I specify in a binary file. The ranges of those numbers are from 1-1,000,000.I then run tests of each algorithm starting from 100,000 numbers, incrementing by 50,000 each time, until 1,000,000 numbers are sorted on the last run. So 20 tests each. I have successfully completed each algorithm but my results are kind of puzzingly. This is a graph showing my results.
I understand that quicksort has a worst case of O(n2) time, but typically O(n·lg(n)) time. Mergesort has Θ(n·lg(n)) time.
Also I would like to note that when I started the timer I just used clock() from time.h, and calculated the time elapsed. I started my timer one line of code before I called my sorting function.
What I dont understand is how my graph shows mergesort is always double the time, and reaching triple the time to sort numbers compared to quicksort.
My only thought is that for my mergesort algorithm every time I divide my array in half I use malloc to create a new integer array for each half. Of course this means a large amount of calls are made to malloc considering the number sizes I am sorting.
int* mergeSort(int* nums, int size){
int* left;
int* right;
int middle = size/2;
if(size <= 1)
return nums;
split(nums, size, &left, &right, middle);
//I dont understand why the code below wouldnt work in place of the split()
//when i run it, in main, nothing gets printed out. I guess i lose my pointer to the beginning of my array.
//left = nums;
//right = nums+middle;
left = mergeSort(left, middle);
right = mergeSort(right, size - middle);
merge(nums,left,right,middle,size - middle);
free(left);
free(right);
return nums;
}
void split(int* nums, int size, int** left, int** right, int middle){
int *lft = (int*) malloc ((sizeof(int) * middle));
int *rght = (int*) malloc ((sizeof(int) * size - middle));
int mid = middle;
int upMid = size - middle;
int i;
for(i=0; i < mid; i++)
lft[i] = nums[i];
for(i=0; i < upMid; i++)
rght[i] = nums[i+middle];
*left = lft;
*right = rght;
}
void merge(int* num, int* left, int* right, int sizeLeft, int sizeRight){
int i,j,k,n;
i=j=k=0;
n=sizeLeft + sizeRight;
while(k < n){
if(i< sizeLeft){
if(j<sizeRight){
insert(num,left,right,&i,&j,&k);
}
else{
append(num, left, sizeLeft, &i, &k);
}
}
else{
append(num,right,sizeRight,&j,&k);
}
}
}
void insert(int* num, int* left, int* right, int* i, int* j, int* k){
/*int i,j,k,n;*/
if(left[*i]<right[*j]){
num[*k] = left[*i];
(*i)++;
}
else{
num[*k] = right[*j];
(*j)++;
}
(*k)++;
}
void append(int* num, int* half, int sizeHalf, int* i, int* k){
while(*i < sizeHalf){
num[*k]= half[*i];
(*i)++; (*k)++;
}
}
I would greatly appreciate any feedback on this question of mine, and any advice on maybe making my mergesort function more efficient. Thanks!!
I have implemented a merge sort algorithm, you can have a look. I malloc a bak array at the beginning of mergeSort and every merge use the it afterwards.
#include <string>
#include <stdlib.h>
void _mergeSort(int *array, int *bakArray, int len) ;
void mergeSort(int *array, int len)
{
int *bak = (int *)malloc(sizeof(int)*len) ;
_mergeSort(array, bak, len) ;
free(bak) ;
}
void _mergeSort(int *array, int *bakArray, int len)
{
if (len >= 2) {
int leftLen = len/2 ;
_mergeSort(array, bakArray, leftLen) ;
_mergeSort(array+leftLen, bakArray, len-leftLen) ;
int *pa = array ;
int *pb = array+leftLen ;
int aIndex = 0 ;
int bIndex = 0 ;
while (aIndex < leftLen && bIndex < len-leftLen) {
int a = pa[aIndex] ;
int b = pb[bIndex] ;
if (a < b) {
bakArray[aIndex+bIndex] = a ;
++aIndex ;
} else if (a == b) {
bakArray[aIndex+bIndex] = a ;
bakArray[aIndex+bIndex+1] = a ;
++aIndex ;
++bIndex ;
} else {
bakArray[aIndex+bIndex] = b ;
++bIndex ;
}
}
if (aIndex < leftLen) {
memcpy(bakArray+aIndex+bIndex, pa+aIndex, sizeof(int)*(leftLen-aIndex)) ;
} else if (bIndex < len-leftLen) {
memcpy(bakArray+aIndex+bIndex, pb+bIndex, sizeof(int)*(len-leftLen-bIndex)) ;
}
memcpy(array, bakArray, sizeof(int)*len) ;
}
}
static const int MaxArraySize = 100 ;
int main()
{
srand(time(NULL)) ;
int array[MaxArraySize] ;
for (int i = 0 ; i < MaxArraySize; ++i) {
array[i] = rand() % 10000 ;
}
mergeSort(array, MaxArraySize) ;
for (int i = 0 ; i < MaxArraySize; ++i) {
printf("%d ", array[i]) ;
}
printf("\n") ;
return 0 ;
}

QuadTree or KD Tree for objective c? [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking for code must demonstrate a minimal understanding of the problem being solved. Include attempted solutions, why they didn't work, and the expected results. See also: Stack Overflow question checklist
Closed 9 years ago.
Improve this question
I'm looking a while for a decent piece of code to use in my app, in one of those algorithms.
I found this example: http://rosettacode.org/wiki/K-d_tree#C
But when I put the code in xcode, I get an errors, for example:
"use of undeclared identifier", "expected ';' at the end of declaration".
I guess a header file is missing?
I copied the code from the link and made a minor edit which moved
"swap" from being an inline nested function to a static function.
Compiled with "gcc -C99 file.c" and it compiled ok. So, no, it doesn't
need some include file. Maybe you mis pasted it.
If you are happy with this answer, you could accept it. Thanks.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define MAX_DIM 3
struct kd_node_t{
double x[MAX_DIM];
struct kd_node_t *left, *right;
};
inline double
dist(struct kd_node_t *a, struct kd_node_t *b, int dim)
{
double t, d = 0;
while (dim--) {
t = a->x[dim] - b->x[dim];
d += t * t;
}
return d;
}
static void swap(struct kd_node_t *x, struct kd_node_t *y) {
double tmp[MAX_DIM];
memcpy(tmp, x->x, sizeof(tmp));
memcpy(x->x, y->x, sizeof(tmp));
memcpy(y->x, tmp, sizeof(tmp));
}
/* see quickselect method */
struct kd_node_t*
find_median(struct kd_node_t *start, struct kd_node_t *end, int idx)
{
if (end <= start) return NULL;
if (end == start + 1)
return start;
struct kd_node_t *p, *store, *md = start + (end - start) / 2;
double pivot;
while (1) {
pivot = md->x[idx];
swap(md, end - 1);
for (store = p = start; p < end; p++) {
if (p->x[idx] < pivot) {
if (p != store)
swap(p, store);
store++;
}
}
swap(store, end - 1);
/* median has duplicate values */
if (store->x[idx] == md->x[idx])
return md;
if (store > md) end = store;
else start = store;
}
}
struct kd_node_t*
make_tree(struct kd_node_t *t, int len, int i, int dim)
{
struct kd_node_t *n;
if (!len) return 0;
if ((n = find_median(t, t + len, i))) {
i = (i + 1) % dim;
n->left = make_tree(t, n - t, i, dim);
n->right = make_tree(n + 1, t + len - (n + 1), i, dim);
}
return n;
}
/* global variable, so sue me */
int visited;
void nearest(struct kd_node_t *root, struct kd_node_t *nd, int i, int dim,
struct kd_node_t **best, double *best_dist)
{
double d, dx, dx2;
if (!root) return;
d = dist(root, nd, dim);
dx = root->x[i] - nd->x[i];
dx2 = dx * dx;
visited ++;
if (!*best || d < *best_dist) {
*best_dist = d;
*best = root;
}
/* if chance of exact match is high */
if (!*best_dist) return;
if (++i >= dim) i = 0;
nearest(dx > 0 ? root->left : root->right, nd, i, dim, best, best_dist);
if (dx2 >= *best_dist) return;
nearest(dx > 0 ? root->right : root->left, nd, i, dim, best, best_dist);
}
#define N 1000000
#define rand1() (rand() / (double)RAND_MAX)
#define rand_pt(v) { v.x[0] = rand1(); v.x[1] = rand1(); v.x[2] = rand1(); }
int main(void)
{
int i;
struct kd_node_t wp[] = {
{{2, 3}}, {{5, 4}}, {{9, 6}}, {{4, 7}}, {{8, 1}}, {{7, 2}}
};
struct kd_node_t this = {{9, 2}};
struct kd_node_t *root, *found, *million;
double best_dist;
root = make_tree(wp, sizeof(wp) / sizeof(wp[1]), 0, 2);
visited = 0;
found = 0;
nearest(root, &this, 0, 2, &found, &best_dist);
printf(">> WP tree\nsearching for (%g, %g)\n"
"found (%g, %g) dist %g\nseen %d nodes\n\n",
this.x[0], this.x[1],
found->x[0], found->x[1], sqrt(best_dist), visited);
million = calloc(N, sizeof(struct kd_node_t));
srand(time(0));
for (i = 0; i < N; i++) rand_pt(million[i]);
root = make_tree(million, N, 0, 3);
rand_pt(this);
visited = 0;
found = 0;
nearest(root, &this, 0, 3, &found, &best_dist);
printf(">> Million tree\nsearching for (%g, %g, %g)\n"
"found (%g, %g, %g) dist %g\nseen %d nodes\n",
this.x[0], this.x[1], this.x[2],
found->x[0], found->x[1], found->x[2],
sqrt(best_dist), visited);
/* search many random points in million tree to see average behavior.
tree size vs avg nodes visited:
10 ~ 7
100 ~ 16.5
1000 ~ 25.5
10000 ~ 32.8
100000 ~ 38.3
1000000 ~ 42.6
10000000 ~ 46.7 */
int sum = 0, test_runs = 100000;
for (i = 0; i < test_runs; i++) {
found = 0;
visited = 0;
rand_pt(this);
nearest(root, &this, 0, 3, &found, &best_dist);
sum += visited;
}
printf("\n>> Million tree\n"
"visited %d nodes for %d random findings (%f per lookup)\n",
sum, test_runs, sum/(double)test_runs);
// free(million);
return 0;
}

CUDA Thrust sort_by_key when the key is a tuple dealt with by zip_iterator's with custom comparison predicate

I've looked through a lot of questions here for something similar and there are quite a few, albeit with one minor change. I'm trying to sort values with a zip_iterator as a compound key.
Specifically, I have the following function:
void thrustSort(
unsigned int * primaryKey,
float * secondaryKey,
unsigned int * values,
unsigned int numberOfPoints)
{
thrust::device_ptr dev_ptr_pkey = thrust::device_pointer_cast(primaryKey);
thrust::device_ptr dev_ptr_skey = thrust::device_pointer_cast(secondaryKey);
thrust::device_ptr dev_ptr_values = thrust::device_pointer_cast(values);
thrust::tuple,thrust::device_ptr> keytup_begin =
thrust::make_tuple,thrust::device_ptr>(dev_ptr_pkey, dev_ptr_skey);
thrust::zip_iterator, thrust::device_ptr > > first =
thrust::make_zip_iterator, thrust::device_ptr > >(keytup_begin);
thrust::sort_by_key(first, first + numberOfPoints, dev_ptr_values, ZipComparator());
}
and this custom predicate:
typedef thrust::device_ptr<unsigned int> tdp_uint ;
typedef thrust::device_ptr<float> tdp_float ;
typedef thrust::tuple<tdp_uint, tdp_float> tdp_uif_tuple ;
struct ZipComparator
{
__host__ __device__
inline bool operator() (const tdp_uif_tuple &a, const tdp_uif_tuple &b)
{
if(a.head < b.head) return true;
if(a.head == b.head) return a.tail < b.tail;
return false;
}
};
The errors I'm getting are:
Error 1 error : no instance of constructor "thrust::device_ptr::device_ptr [with T=unsigned int]" matches the argument list C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\include\thrust\detail\tuple.inl 309 1 ---
Error 2 error : no instance of constructor "thrust::device_ptr::device_ptr [with T=float]" matches the argument list C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\include\thrust\detail\tuple.inl 401 1 ---
Any ideas what might cause this / how do I write a predicate that indeed works?
Thanks in Advance,
Nathan
The comparator takes arguments of type const thrust::tuple<unsigned int, float>&. The const tdp_uif_tuple& type you defined expands to const thrust::tuple<thrust::device_ptr<unsigned int>, thrust:device_ptr<float> >&
The code below compiles for me:
struct ZipComparator
{
__host__ __device__
inline bool operator() (const thrust::tuple<unsigned int, float> &a, const thrust::tuple<unsigned int, float> &b)
{
if(a.head < b.head) return true;
if(a.head == b.head) return a.tail < b.tail;
return false;
}
};
Hope it does for you as well :)
http://code.google.com/p/thrust/wiki/QuickStartGuide#zip_iterator has more details on the zip iterator.
Not required, but if you're looking to clean up the length of those templates, you can do this:
void thrustSort(
unsigned int * primaryKey,
float * secondaryKey,
unsigned int * values,
unsigned int numberOfPoints)
{
tdp_uint dev_ptr_pkey(primaryKey);
tdp_float dev_ptr_skey(secondaryKey);
tdp_uint dev_ptr_values(values);
thrust::tuple<tdp_uint, tdp_float> keytup_begin = thrust::make_tuple(dev_ptr_pkey, dev_ptr_skey);
thrust::zip_iterator<thrust::tuple<tdp_uint, tdp_float> > first =
thrust::make_zip_iterator(keytup_begin);
thrust::sort_by_key(first, first + numberOfPoints, dev_ptr_values, ZipComparator());
}
A lot of the template arguments can be inferred from the arguments.
This is a fully worked example on how using sort_by_key when the key is a tuple dealt with by zip_iterator's and a customized comparison operator.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include "Utilities.cuh"
// --- Defining tuple type
typedef thrust::tuple<int, int> Tuple;
/**************************/
/* TUPLE ORDERING FUNCTOR */
/**************************/
struct TupleComp
{
__host__ __device__ bool operator()(const Tuple& t1, const Tuple& t2)
{
if (t1.get<0>() < t2.get<0>())
return true;
if (t1.get<0>() > t2.get<0>())
return false;
return t1.get<1>() < t2.get<1>();
}
};
/********/
/* MAIN */
/********/
int main()
{
const int N = 8;
// --- Keys and values on the host: allocation and definition
int h_keys1[N] = { 1, 3, 3, 3, 2, 3, 2, 1 };
int h_keys2[N] = { 1, 5, 3, 8, 2, 8, 1, 1 };
float h_values[N] = { 0.3, 5.1, 3.2, -0.08, 2.1, 5.2, 1.1, 0.01};
printf("\n\n");
printf("Original\n");
for (int i = 0; i < N; i++) {
printf("%i %i %f\n", h_keys1[i], h_keys2[i], h_values[i]);
}
// --- Keys and values on the device: allocation
int *d_keys1; gpuErrchk(cudaMalloc(&d_keys1, N * sizeof(int)));
int *d_keys2; gpuErrchk(cudaMalloc(&d_keys2, N * sizeof(int)));
float *d_values; gpuErrchk(cudaMalloc(&d_values, N * sizeof(float)));
// --- Keys and values: host -> device
gpuErrchk(cudaMemcpy(d_keys1, h_keys1, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_keys2, h_keys2, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
// --- From raw pointers to device_ptr
thrust::device_ptr<int> dev_ptr_keys1 = thrust::device_pointer_cast(d_keys1);
thrust::device_ptr<int> dev_ptr_keys2 = thrust::device_pointer_cast(d_keys2);
thrust::device_ptr<float> dev_ptr_values = thrust::device_pointer_cast(d_values);
// --- Declare outputs
thrust::device_vector<float> d_values_output(N);
thrust::device_vector<Tuple> d_keys_output(N);
auto begin_keys = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1, dev_ptr_keys2));
auto end_keys = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1 + N, dev_ptr_keys2 + N));
thrust::sort_by_key(begin_keys, end_keys, dev_ptr_values, TupleComp());
int *h_keys1_output = (int *)malloc(N * sizeof(int));
int *h_keys2_output = (int *)malloc(N * sizeof(int));
float *h_values_output = (float *)malloc(N * sizeof(float));
gpuErrchk(cudaMemcpy(h_keys1_output, d_keys1, N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_keys2_output, d_keys2, N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_values_output, d_values, N * sizeof(float), cudaMemcpyDeviceToHost));
printf("\n\n");
printf("Ordered\n");
for (int i = 0; i < N; i++) {
printf("%i %i %f\n", h_keys1_output[i], h_keys2_output[i], h_values_output[i]);
}
}

How to quickly find a image in another image using CUDA?

In my current project I need to find pixel exact position of image contained in another image of larger size. Smaller image is never rotated or stretched (so should match pixel by pixel) but it may have different brightness and some pixels in the image may be distorted. My first attemp was to do it on CPU but it was too slow. The calculations are very parallel, so I decided to use the GPU. I just started to learn CUDA and wrote my first CUDA app. My code works but it still is too slow even on GPU. When the larger image has a dimension of 1024x1280 and smaller is 128x128 program performs calculations in 2000ms on GeForce GTX 560 ti. I need to get results in less than 200ms. In the future I'll probably need a more complex algorithm, so I'd rather have even more computational power reserve. The question is how I can optimise my code to achieve that speed up?
CUDAImageLib.dll:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cutil.h>
//#define SUPPORT_ALPHA
__global__ void ImageSearch_kernel(float* BufferOut, float* BufferB, float* BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
float diff = 0;
for (unsigned int y = 0; y < sh; ++y)
{
for (unsigned int x = 0; x < sw; ++x)
{
unsigned int as = (x + y * sw) * 4;
unsigned int ab = (x + bx + (y + by) * bw) * 4;
#ifdef SUPPORT_ALPHA
diff += ((abs(BufferS[as] - BufferB[ab]) + abs(BufferS[as + 1] - BufferB[ab + 1]) + abs(BufferS[as + 2] - BufferB[ab + 2])) * BufferS[as + 3] * BufferB[ab + 3]);
#else
diff += abs(BufferS[as] - BufferB[ab]);
diff += abs(BufferS[as + 1] - BufferB[ab + 1]);
diff += abs(BufferS[as + 2] - BufferB[ab + 2]);
#endif
}
}
BufferOut[bx + (by * (bw - sw))] = diff;
}
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
int aBytes = (bw * bh) * 4 * sizeof(float);
int bBytes = (sw * sh) * 4 * sizeof(float);
int cBytes = ((bw - sw) * (bh - sh)) * sizeof(float);
dim3 threadsPerBlock(32, 32);
dim3 numBlocks((bw - sw) / threadsPerBlock.x, (bh - sh) / threadsPerBlock.y);
float *dev_B = 0;
float *dev_S = 0;
float *dev_Out = 0;
unsigned int timer = 0;
float sExecutionTime = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cutCreateTimer(&timer);
cutStartTimer(timer);
// Launch a kernel on the GPU with one thread for each element.
ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cutStopTimer(timer);
sExecutionTime = cutGetTimerValue(timer);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_Out);
cudaFree(dev_B);
cudaFree(dev_S);
return (int)sExecutionTime;
}
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
int minIndex = 0;
float minValue = 3.4e+38F;
for (int i = 0; i < count; ++i)
{
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
}
return minIndex;
}
C# test app:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;
namespace TestCUDAImageSearch
{
class Program
{
static void Main(string[] args)
{
using(Bitmap big = new Bitmap("Big.png"), small = new Bitmap("Small.png"))
{
Console.WriteLine("Big " + big.Width + "x" + big.Height + " Small " + small.Width + "x" + small.Height);
Stopwatch sw = new Stopwatch();
sw.Start();
Point point = CUDAImageLIb.ImageSearch(big, small);
sw.Stop();
long t = sw.ElapsedMilliseconds;
Console.WriteLine("Image found at " + point.X + "x" + point.Y);
Console.WriteLine("total time=" + t + "ms kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
}
Console.WriteLine("Hit key");
Console.ReadKey();
}
}
}
//#define SUPPORT_HSB
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
namespace TestCUDAImageSearch
{
public static class CUDAImageLIb
{
[DllImport("CUDAImageLib.dll")]
private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);
[DllImport("CUDAImageLib.dll")]
private static extern int FindMinCPU(float[] values, int count);
private static int _lastKernelTime = 0;
public static int LastKernelTime
{
get { return _lastKernelTime; }
}
public static Point ImageSearch(Bitmap big, Bitmap small)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
float[] diffs = new float[mx * my];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
int minIndex = FindMinCPU(diffs, diffs.Length);
return new Point(minIndex % mx, minIndex / mx);
}
public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
int nDiff = mx * my;
float[] diffs = new float[nDiff];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
List<Point> points = new List<Point>();
for(int i = 0; i < nDiff; ++i)
{
if (diffs[i] < maxDeviation)
{
points.Add(new Point(i % mx, i / mx));
}
}
return points;
}
#if SUPPORT_HSB
private static float[] ImageToFloat(Bitmap img)
{
int w = img.Width;
int h = img.Height;
float[] pix = new float[w * h * 4];
int i = 0;
for (int y = 0; y < h; ++y)
{
for (int x = 0; x < w; ++x)
{
Color c = img.GetPixel(x, y);
pix[i] = c.GetHue() / 360;
pix[i + 1] = c.GetSaturation();
pix[i + 2] = c.GetBrightness();
pix[i + 3] = c.A;
i += 4;
}
}
return pix;
}
#else
private static float[] ImageToFloat(Bitmap bmp)
{
int w = bmp.Width;
int h = bmp.Height;
int n = w * h;
float[] pix = new float[n * 4];
System.Diagnostics.Debug.Assert(bmp.PixelFormat == PixelFormat.Format32bppArgb);
Rectangle r = new Rectangle(0, 0, w, h);
BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, bmp.PixelFormat);
System.Diagnostics.Debug.Assert(bmpData.Stride > 0);
int[] pixels = new int[n];
System.Runtime.InteropServices.Marshal.Copy(bmpData.Scan0, pixels, 0, n);
bmp.UnlockBits(bmpData);
int j = 0;
for (int i = 0; i < n; ++i)
{
pix[j] = (pixels[i] & 255) / 255.0f;
pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
j += 4;
}
return pix;
}
#endif
}
}
Looks like what you are talking about is a well known problem: Template matching. The easiest way forward is to convolve the Image (the bigger image) with the template (the smaller image). You could implement convolutions in one of two ways.
1) Modify the convolutions example from the CUDA SDK (similar to what you are doing anyway).
2) Use FFTs to implement the convolution. Ref. Convolution theorem. You will need to remember
% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));
You could use cufft to implement the 2 dimensional FFTs (After padding them appropriately). You will need to write a kernel that does element wise multiplication and then normalizes the result (because CUFFT does not normalize) before performing the inverse FFT.
For the sizes you mention, (1024 x 1280 and 128 x 128), the inputs must be padded to atleast ((1024 + 128 - 1) x (1280 + 128 -1) = 1151 x 1407). But FFTs are fastest when the (padded) inputs are powers of 2. So you will need to pad both the large and small images to size 2048 x 2048.
You could speed up your calculations by using faster memory access, for example by using
Texture Cache for the big image
Shared Memory or Constant Cache for the small image or parts of it.
But your real problem is the whole approach of your comparison. Comparing the images pixel by pixel at every possible location will never be efficient. There is just too much work to do. First you should think about finding ways to
Select the interesting image regions in the big image where the small image might be contained and only search in these
Find a faster comparison mechanism, by something representing the images that are not their pixels values. You should be able to compare the images by computing a representation with less data, e.g. a color histogram, or integral images.