Related
I am using vulkan-tutorial codes and i made modify for cubemap.
when i use VK_FORMAT_R8G8B8A8_UNORM is working with this code:
unsigned char* pixelsArray[6];
for (int i = 0; i < 6; ++i)
{
pixelsArray[i] = stbi_load(imageFileArray[i].c_str(), &texWidth, &texHeight, &texChannels, STBI_rgb_alpha);
}
VkDeviceSize allSize = texWidth * texHeight * 4 * 6;
VkDeviceSize size = texWidth * texHeight * 4 ;
VkBufferCreateInfo bufferInfo{};
...
bufferInfo.size = allSize ;
vkMapMemory(device, stagingBufferMemory, 0, AllSize, 0, &data);
for(int i = 0; i < 6; ++i)
{
memcpy( (char*) data + (size*i) , pixelsArray[i], static_cast<size_t>(size));
}
vkUnmapMemory(device, stagingBufferMemory);
VkImageCreateInfo imageInfo{};
...
imageInfo.arrayLayers = 6;
imageInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
imageInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
VkImageViewCreateInfo viewInfo{};
...
viewInfo.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
viewInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
viewInfo.subresourceRange.layerCount = 6;
but when i try VK_FORMAT_R16G16B16A16_SFLOAT is giving distorted display and no validation error with this code:
float* pixelsArray[6];
for (int i = 0; i < 6; ++i)
{
pixelsArray[i] = stbi_loadf(imageFileArray[i].c_str(), &texWidth, &texHeight, &texChannels, STBI_rgb_alpha);
}
VkDeviceSize allSize = texWidth * texHeight * 4 * 6 * 2;// I added *2
VkDeviceSize size = texWidth * texHeight * 4 * 2;// I added *2
VkBufferCreateInfo bufferInfo{};
...
bufferInfo.size = allSize ;
vkMapMemory(device, stagingBufferMemory, 0, AllSize, 0, &data);
for(int i = 0; i < 6; ++i)
{
memcpy( (char*) data + (size*i) , pixelsArray[i], static_cast<size_t>(size));
}
vkUnmapMemory(device, stagingBufferMemory);
VkImageCreateInfo imageInfo{};
...
imageInfo.arrayLayers = 6;
imageInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
imageInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
VkImageViewCreateInfo viewInfo{};
...
viewInfo.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
viewInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
viewInfo.subresourceRange.layerCount = 6;
when VK_FORMAT_R8G8B8A8_UNORM :
when VK_FORMAT_R16G16B16A16_SFLOAT :
i fixed the problem. problem was that i want to use half float but i was sending float to memcpy function.i searched how can i use half float and i found a solution without using extra library.
what i did add helper functions :
typedef unsigned int uint;
typedef unsigned short ushort;
uint as_uint(const float x)
{
return *(uint*)&x;
}
ushort float_to_half(const float x)
{
// IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint b = as_uint(x)+0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
const uint e = (b&0x7F800000)>>23; // exponent
const uint m = b&0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
return (b&0x80000000)>>16 | (e>112)*((((e-112)<<10)&0x7C00)|m>>13) | ((e<113)&(e>101))*((((0x007FF000+m)>>(125-e))+1)>>1) | (e>143)*0x7FFF; // sign : normalized : denormalized : saturate
}
and fix problem with this helper functions :
VkDeviceSize size_2 = texWidth * texHeight * 4;// different from the above variables in question : allSize or size
//create half float for cubemap
void* half_pixelsArray[6];
half_pixelsArray[0] = new ushort[size_2];
half_pixelsArray[1] = new ushort[size_2];
half_pixelsArray[2] = new ushort[size_2];
half_pixelsArray[3] = new ushort[size_2];
half_pixelsArray[4] = new ushort[size_2];
half_pixelsArray[5] = new ushort[size_2];
//copy from float to half float
for (int i = 0; i < 6; ++i)
{
for (int j = 0; j < size_2; ++j)
{
((ushort*)half_pixelsArray[i])[j] = float_to_half( pixelsArray[i][j] );
}
}
// and change float to half flaot in memcpy
memcpy( (char*) data + (layerSize*i) , half_pixelsArray[i], static_cast<size_t>(layerSize));
I have the following Objective-C code that generates a 220hz tone, however, it doesn't sound like other 220hz tones, for example Wikipedia has a 220 tone here: https://en.wikipedia.org/wiki/File:220_Hz_sine_wave.ogg that sounds very low, mine sounds really high pitched in comparison...any ideas?
- (void)render:(AVAudioPCMBuffer *)bufferInput frequency:(float)frequency amplitude:(float)amplitude
{
const int channels = 2;
int inNumberFrames = bufferInput.frameLength;
float *const *floatChannelData = bufferInput.floatChannelData;
for (int i = 0; i < inNumberFrames ; i ++) {
float theta = frequency * i * 2.0 * M_PI / 44100;
float value = sinf(theta);
for (int channelNumber = 0; channelNumber < channels ; channelNumber++) {
float * const channelBuffer = floatChannelData[channelNumber];
channelBuffer[i] = value * amplitude;
}
}
}
I've solved an lp using CPLEX callable library (in VS2010). The lp is the following:
Maximize
obj: x1 + 2 x2 + 3 x3
Subject To
c1: - x1 + x2 + x3 <= 20
c2: x1 - 3 x2 + x3 <= 30
Bounds
0 <= x1 <= 40
End
The code is given beneath. Now I would like to make it an MIP (additional integrality constraints on the x's). I tried to do so by changing status = CPXlpopt (env, lp); into status = CPXmipopt (env, lp);. This does not work and I get the error 3003: not a mixed-integer problem. Does anybody know what I am missing here?
int main ()
{
/* Declare and allocate space for the variables and arrays where we
will store the optimization results including the status, objective
value, variable values, dual values, row slacks and variable
reduced costs. */
int solstat;
double objval;
double *x = NULL;
double *pi = NULL;
double *slack = NULL;
double *dj = NULL;
CPXENVptr env = NULL;
CPXLPptr lp = NULL;
int status = 0;
int i, j;
int cur_numrows, cur_numcols;
/* Initialize the CPLEX environment */
env = CPXopenCPLEX (&status);
/* Turn on output to the screen */
status = CPXsetintparam (env, CPX_PARAM_SCRIND, CPX_ON);
/* Turn on data checking */
status = CPXsetintparam (env, CPX_PARAM_DATACHECK, CPX_ON);
/* Create the problem. */
lp = CPXcreateprob (env, &status, "lpex1");
/* Now populate the problem with the data. */
#define NUMROWS 2
#define NUMCOLS 3
#define NUMNZ 6
/* To populate by column, we first create the rows, and then add the columns. */
int status = 0;
double obj[NUMCOLS];
double lb[NUMCOLS];
double ub[NUMCOLS];
char *colname[NUMCOLS];
int matbeg[NUMCOLS];
int matind[NUMNZ];
double matval[NUMNZ];
double rhs[NUMROWS];
char sense[NUMROWS];
char *rowname[NUMROWS];
CPXchgobjsen (env, lp, CPX_MAX); /* Problem is maximization */
/* Now create the new rows. First, populate the arrays. */
rowname[0] = "c1";
sense[0] = 'L';
rhs[0] = 20.0;
rowname[1] = "c2";
sense[1] = 'L';
rhs[1] = 30.0;
status = CPXnewrows (env, lp, NUMROWS, rhs, sense, NULL, rowname);
if ( status ) goto TERMINATE;
/* Now add the new columns. First, populate the arrays. */
obj[0] = 1.0; obj[1] = 2.0; obj[2] = 3.0;
matbeg[0] = 0; matbeg[1] = 2; matbeg[2] = 4;
matind[0] = 0; matind[2] = 0; matind[4] = 0;
matval[0] = -1.0; matval[2] = 1.0; matval[4] = 1.0;
matind[1] = 1; matind[3] = 1; matind[5] = 1;
matval[1] = 1.0; matval[3] = -3.0; matval[5] = 1.0;
lb[0] = 0.0; lb[1] = 0.0; lb[2] = 0.0;
ub[0] = 40.0; ub[1] = CPX_INFBOUND; ub[2] = CPX_INFBOUND;
colname[0] = "x1"; colname[1] = "x2"; colname[2] = "x3";
status = CPXaddcols (env, lp, NUMCOLS, NUMNZ, obj, matbeg, matind, matval, lb, ub, colname);
/* Optimize the problem and obtain solution. */
status = CPXlpopt (env, lp);
cur_numrows = CPXgetnumrows (env, lp);
cur_numcols = CPXgetnumcols (env, lp);
x = (double *) malloc (cur_numcols * sizeof(double));
slack = (double *) malloc (cur_numrows * sizeof(double));
dj = (double *) malloc (cur_numcols * sizeof(double));
pi = (double *) malloc (cur_numrows * sizeof(double));
status = CPXsolution (env, lp, &solstat, &objval, x, pi, slack, dj);
/* Write the output to the screen. */
printf ("\nSolution status = %d\n", solstat);
printf ("Solution value = %f\n\n", objval);
for (i = 0; i < cur_numrows; i++) {
printf ("Row %d: Slack = %10f Pi = %10f\n", i, slack[i], pi[i]);
}
for (j = 0; j < cur_numcols; j++) {
printf ("Column %d: Value = %10f Reduced cost = %10f\n",
j, x[j], dj[j]);
}
/* Finally, write a copy of the problem to a file. */
status = CPXwriteprob (env, lp, "lpex1.lp", NULL);
/* Free up the solution */
... (additional code to free up the solution)...
return(status)
}
In your code, you are not declaring any decision variables to be integer. That's why cplex is complaining when you try to solve your problem using a MIP solver. You are doing column-wise modeling and CPXaddcols doesn't have have a parameter for the variable type, but you can use CPXcopyctype or CPXchgctype. Since you the bounds on your decision variables are all greater than 1, you are looking for the 'I' variable type, instead of 'B' for binary.
char *ctype;
ctype = (char *) malloc(cur_numcols * sizeof(char);
for (j = 0; j < cur_numcols; j++) {
ctype[j] = 'I';
}
status = CPXcopyctype(env, lp, ctype);
/* verify status */
status = CPXmipopt (env, lp);
/* verify status */
Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking for code must demonstrate a minimal understanding of the problem being solved. Include attempted solutions, why they didn't work, and the expected results. See also: Stack Overflow question checklist
Closed 9 years ago.
Improve this question
I'm looking a while for a decent piece of code to use in my app, in one of those algorithms.
I found this example: http://rosettacode.org/wiki/K-d_tree#C
But when I put the code in xcode, I get an errors, for example:
"use of undeclared identifier", "expected ';' at the end of declaration".
I guess a header file is missing?
I copied the code from the link and made a minor edit which moved
"swap" from being an inline nested function to a static function.
Compiled with "gcc -C99 file.c" and it compiled ok. So, no, it doesn't
need some include file. Maybe you mis pasted it.
If you are happy with this answer, you could accept it. Thanks.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define MAX_DIM 3
struct kd_node_t{
double x[MAX_DIM];
struct kd_node_t *left, *right;
};
inline double
dist(struct kd_node_t *a, struct kd_node_t *b, int dim)
{
double t, d = 0;
while (dim--) {
t = a->x[dim] - b->x[dim];
d += t * t;
}
return d;
}
static void swap(struct kd_node_t *x, struct kd_node_t *y) {
double tmp[MAX_DIM];
memcpy(tmp, x->x, sizeof(tmp));
memcpy(x->x, y->x, sizeof(tmp));
memcpy(y->x, tmp, sizeof(tmp));
}
/* see quickselect method */
struct kd_node_t*
find_median(struct kd_node_t *start, struct kd_node_t *end, int idx)
{
if (end <= start) return NULL;
if (end == start + 1)
return start;
struct kd_node_t *p, *store, *md = start + (end - start) / 2;
double pivot;
while (1) {
pivot = md->x[idx];
swap(md, end - 1);
for (store = p = start; p < end; p++) {
if (p->x[idx] < pivot) {
if (p != store)
swap(p, store);
store++;
}
}
swap(store, end - 1);
/* median has duplicate values */
if (store->x[idx] == md->x[idx])
return md;
if (store > md) end = store;
else start = store;
}
}
struct kd_node_t*
make_tree(struct kd_node_t *t, int len, int i, int dim)
{
struct kd_node_t *n;
if (!len) return 0;
if ((n = find_median(t, t + len, i))) {
i = (i + 1) % dim;
n->left = make_tree(t, n - t, i, dim);
n->right = make_tree(n + 1, t + len - (n + 1), i, dim);
}
return n;
}
/* global variable, so sue me */
int visited;
void nearest(struct kd_node_t *root, struct kd_node_t *nd, int i, int dim,
struct kd_node_t **best, double *best_dist)
{
double d, dx, dx2;
if (!root) return;
d = dist(root, nd, dim);
dx = root->x[i] - nd->x[i];
dx2 = dx * dx;
visited ++;
if (!*best || d < *best_dist) {
*best_dist = d;
*best = root;
}
/* if chance of exact match is high */
if (!*best_dist) return;
if (++i >= dim) i = 0;
nearest(dx > 0 ? root->left : root->right, nd, i, dim, best, best_dist);
if (dx2 >= *best_dist) return;
nearest(dx > 0 ? root->right : root->left, nd, i, dim, best, best_dist);
}
#define N 1000000
#define rand1() (rand() / (double)RAND_MAX)
#define rand_pt(v) { v.x[0] = rand1(); v.x[1] = rand1(); v.x[2] = rand1(); }
int main(void)
{
int i;
struct kd_node_t wp[] = {
{{2, 3}}, {{5, 4}}, {{9, 6}}, {{4, 7}}, {{8, 1}}, {{7, 2}}
};
struct kd_node_t this = {{9, 2}};
struct kd_node_t *root, *found, *million;
double best_dist;
root = make_tree(wp, sizeof(wp) / sizeof(wp[1]), 0, 2);
visited = 0;
found = 0;
nearest(root, &this, 0, 2, &found, &best_dist);
printf(">> WP tree\nsearching for (%g, %g)\n"
"found (%g, %g) dist %g\nseen %d nodes\n\n",
this.x[0], this.x[1],
found->x[0], found->x[1], sqrt(best_dist), visited);
million = calloc(N, sizeof(struct kd_node_t));
srand(time(0));
for (i = 0; i < N; i++) rand_pt(million[i]);
root = make_tree(million, N, 0, 3);
rand_pt(this);
visited = 0;
found = 0;
nearest(root, &this, 0, 3, &found, &best_dist);
printf(">> Million tree\nsearching for (%g, %g, %g)\n"
"found (%g, %g, %g) dist %g\nseen %d nodes\n",
this.x[0], this.x[1], this.x[2],
found->x[0], found->x[1], found->x[2],
sqrt(best_dist), visited);
/* search many random points in million tree to see average behavior.
tree size vs avg nodes visited:
10 ~ 7
100 ~ 16.5
1000 ~ 25.5
10000 ~ 32.8
100000 ~ 38.3
1000000 ~ 42.6
10000000 ~ 46.7 */
int sum = 0, test_runs = 100000;
for (i = 0; i < test_runs; i++) {
found = 0;
visited = 0;
rand_pt(this);
nearest(root, &this, 0, 3, &found, &best_dist);
sum += visited;
}
printf("\n>> Million tree\n"
"visited %d nodes for %d random findings (%f per lookup)\n",
sum, test_runs, sum/(double)test_runs);
// free(million);
return 0;
}
In my current project I need to find pixel exact position of image contained in another image of larger size. Smaller image is never rotated or stretched (so should match pixel by pixel) but it may have different brightness and some pixels in the image may be distorted. My first attemp was to do it on CPU but it was too slow. The calculations are very parallel, so I decided to use the GPU. I just started to learn CUDA and wrote my first CUDA app. My code works but it still is too slow even on GPU. When the larger image has a dimension of 1024x1280 and smaller is 128x128 program performs calculations in 2000ms on GeForce GTX 560 ti. I need to get results in less than 200ms. In the future I'll probably need a more complex algorithm, so I'd rather have even more computational power reserve. The question is how I can optimise my code to achieve that speed up?
CUDAImageLib.dll:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cutil.h>
//#define SUPPORT_ALPHA
__global__ void ImageSearch_kernel(float* BufferOut, float* BufferB, float* BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
float diff = 0;
for (unsigned int y = 0; y < sh; ++y)
{
for (unsigned int x = 0; x < sw; ++x)
{
unsigned int as = (x + y * sw) * 4;
unsigned int ab = (x + bx + (y + by) * bw) * 4;
#ifdef SUPPORT_ALPHA
diff += ((abs(BufferS[as] - BufferB[ab]) + abs(BufferS[as + 1] - BufferB[ab + 1]) + abs(BufferS[as + 2] - BufferB[ab + 2])) * BufferS[as + 3] * BufferB[ab + 3]);
#else
diff += abs(BufferS[as] - BufferB[ab]);
diff += abs(BufferS[as + 1] - BufferB[ab + 1]);
diff += abs(BufferS[as + 2] - BufferB[ab + 2]);
#endif
}
}
BufferOut[bx + (by * (bw - sw))] = diff;
}
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
int aBytes = (bw * bh) * 4 * sizeof(float);
int bBytes = (sw * sh) * 4 * sizeof(float);
int cBytes = ((bw - sw) * (bh - sh)) * sizeof(float);
dim3 threadsPerBlock(32, 32);
dim3 numBlocks((bw - sw) / threadsPerBlock.x, (bh - sh) / threadsPerBlock.y);
float *dev_B = 0;
float *dev_S = 0;
float *dev_Out = 0;
unsigned int timer = 0;
float sExecutionTime = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cutCreateTimer(&timer);
cutStartTimer(timer);
// Launch a kernel on the GPU with one thread for each element.
ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cutStopTimer(timer);
sExecutionTime = cutGetTimerValue(timer);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_Out);
cudaFree(dev_B);
cudaFree(dev_S);
return (int)sExecutionTime;
}
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
int minIndex = 0;
float minValue = 3.4e+38F;
for (int i = 0; i < count; ++i)
{
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
}
return minIndex;
}
C# test app:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;
namespace TestCUDAImageSearch
{
class Program
{
static void Main(string[] args)
{
using(Bitmap big = new Bitmap("Big.png"), small = new Bitmap("Small.png"))
{
Console.WriteLine("Big " + big.Width + "x" + big.Height + " Small " + small.Width + "x" + small.Height);
Stopwatch sw = new Stopwatch();
sw.Start();
Point point = CUDAImageLIb.ImageSearch(big, small);
sw.Stop();
long t = sw.ElapsedMilliseconds;
Console.WriteLine("Image found at " + point.X + "x" + point.Y);
Console.WriteLine("total time=" + t + "ms kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
}
Console.WriteLine("Hit key");
Console.ReadKey();
}
}
}
//#define SUPPORT_HSB
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
namespace TestCUDAImageSearch
{
public static class CUDAImageLIb
{
[DllImport("CUDAImageLib.dll")]
private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);
[DllImport("CUDAImageLib.dll")]
private static extern int FindMinCPU(float[] values, int count);
private static int _lastKernelTime = 0;
public static int LastKernelTime
{
get { return _lastKernelTime; }
}
public static Point ImageSearch(Bitmap big, Bitmap small)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
float[] diffs = new float[mx * my];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
int minIndex = FindMinCPU(diffs, diffs.Length);
return new Point(minIndex % mx, minIndex / mx);
}
public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
int nDiff = mx * my;
float[] diffs = new float[nDiff];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
List<Point> points = new List<Point>();
for(int i = 0; i < nDiff; ++i)
{
if (diffs[i] < maxDeviation)
{
points.Add(new Point(i % mx, i / mx));
}
}
return points;
}
#if SUPPORT_HSB
private static float[] ImageToFloat(Bitmap img)
{
int w = img.Width;
int h = img.Height;
float[] pix = new float[w * h * 4];
int i = 0;
for (int y = 0; y < h; ++y)
{
for (int x = 0; x < w; ++x)
{
Color c = img.GetPixel(x, y);
pix[i] = c.GetHue() / 360;
pix[i + 1] = c.GetSaturation();
pix[i + 2] = c.GetBrightness();
pix[i + 3] = c.A;
i += 4;
}
}
return pix;
}
#else
private static float[] ImageToFloat(Bitmap bmp)
{
int w = bmp.Width;
int h = bmp.Height;
int n = w * h;
float[] pix = new float[n * 4];
System.Diagnostics.Debug.Assert(bmp.PixelFormat == PixelFormat.Format32bppArgb);
Rectangle r = new Rectangle(0, 0, w, h);
BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, bmp.PixelFormat);
System.Diagnostics.Debug.Assert(bmpData.Stride > 0);
int[] pixels = new int[n];
System.Runtime.InteropServices.Marshal.Copy(bmpData.Scan0, pixels, 0, n);
bmp.UnlockBits(bmpData);
int j = 0;
for (int i = 0; i < n; ++i)
{
pix[j] = (pixels[i] & 255) / 255.0f;
pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
j += 4;
}
return pix;
}
#endif
}
}
Looks like what you are talking about is a well known problem: Template matching. The easiest way forward is to convolve the Image (the bigger image) with the template (the smaller image). You could implement convolutions in one of two ways.
1) Modify the convolutions example from the CUDA SDK (similar to what you are doing anyway).
2) Use FFTs to implement the convolution. Ref. Convolution theorem. You will need to remember
% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));
You could use cufft to implement the 2 dimensional FFTs (After padding them appropriately). You will need to write a kernel that does element wise multiplication and then normalizes the result (because CUFFT does not normalize) before performing the inverse FFT.
For the sizes you mention, (1024 x 1280 and 128 x 128), the inputs must be padded to atleast ((1024 + 128 - 1) x (1280 + 128 -1) = 1151 x 1407). But FFTs are fastest when the (padded) inputs are powers of 2. So you will need to pad both the large and small images to size 2048 x 2048.
You could speed up your calculations by using faster memory access, for example by using
Texture Cache for the big image
Shared Memory or Constant Cache for the small image or parts of it.
But your real problem is the whole approach of your comparison. Comparing the images pixel by pixel at every possible location will never be efficient. There is just too much work to do. First you should think about finding ways to
Select the interesting image regions in the big image where the small image might be contained and only search in these
Find a faster comparison mechanism, by something representing the images that are not their pixels values. You should be able to compare the images by computing a representation with less data, e.g. a color histogram, or integral images.