Related
I am using the vulkan-tutorial code, which I modified to load a cubemap.
When I use VK_FORMAT_R8G8B8A8_UNORM, it works with this code:
unsigned char* pixelsArray[6];
for (int i = 0; i < 6; ++i)
{
pixelsArray[i] = stbi_load(imageFileArray[i].c_str(), &texWidth, &texHeight, &texChannels, STBI_rgb_alpha);
}
VkDeviceSize allSize = texWidth * texHeight * 4 * 6;
VkDeviceSize size = texWidth * texHeight * 4 ;
VkBufferCreateInfo bufferInfo{};
...
bufferInfo.size = allSize ;
vkMapMemory(device, stagingBufferMemory, 0, AllSize, 0, &data);
for(int i = 0; i < 6; ++i)
{
memcpy( (char*) data + (size*i) , pixelsArray[i], static_cast<size_t>(size));
}
vkUnmapMemory(device, stagingBufferMemory);
VkImageCreateInfo imageInfo{};
...
imageInfo.arrayLayers = 6;
imageInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
imageInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
VkImageViewCreateInfo viewInfo{};
...
viewInfo.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
viewInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
viewInfo.subresourceRange.layerCount = 6;
But when I try VK_FORMAT_R16G16B16A16_SFLOAT, the display is distorted and there are no validation errors. This is the code:
float* pixelsArray[6];
for (int i = 0; i < 6; ++i)
{
pixelsArray[i] = stbi_loadf(imageFileArray[i].c_str(), &texWidth, &texHeight, &texChannels, STBI_rgb_alpha);
}
VkDeviceSize allSize = texWidth * texHeight * 4 * 6 * 2;// I added *2
VkDeviceSize size = texWidth * texHeight * 4 * 2;// I added *2
VkBufferCreateInfo bufferInfo{};
...
bufferInfo.size = allSize ;
vkMapMemory(device, stagingBufferMemory, 0, AllSize, 0, &data);
for(int i = 0; i < 6; ++i)
{
memcpy( (char*) data + (size*i) , pixelsArray[i], static_cast<size_t>(size));
}
vkUnmapMemory(device, stagingBufferMemory);
VkImageCreateInfo imageInfo{};
...
imageInfo.arrayLayers = 6;
imageInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
imageInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
VkImageViewCreateInfo viewInfo{};
...
viewInfo.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
viewInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
viewInfo.subresourceRange.layerCount = 6;
when VK_FORMAT_R8G8B8A8_UNORM :
when VK_FORMAT_R16G16B16A16_SFLOAT :
I fixed the problem. The cause was that I wanted to use half floats, but I was passing 32-bit floats to memcpy. I searched for a way to produce half floats and found a solution that needs no extra library.
What I did was add these helper functions:
typedef unsigned int uint;
typedef unsigned short ushort;

// Returns the bit pattern of `x` as an unsigned integer.
// The bytes are copied with memcpy instead of dereferencing a casted pointer:
// reading a float object through a `uint*` violates the aliasing rules.
uint as_uint(const float x)
{
    static_assert(sizeof(uint) == sizeof(float), "as_uint requires float and unsigned int of equal size");
    uint bits;
    memcpy(&bits, &x, sizeof bits);
    return bits;
}

// Converts a 32-bit float to a 16-bit half-float bit pattern (1 sign bit,
// 5 exponent bits, 10 mantissa bits).
// IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
// Inputs whose biased float exponent exceeds 143 produce 0x7FFF in the low
// 15 bits ("saturate"); inputs too small for a half denormal produce zero.
ushort float_to_half(const float x)
{
    const uint b = as_uint(x) + 0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
    const uint e = (b & 0x7F800000) >> 23;  // biased float exponent
    const uint m = b & 0x007FFFFF;          // mantissa; below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding

    const uint sign = (b & 0x80000000) >> 16;

    // Each case is selected with a conditional rather than a multiply-by-flag,
    // so the denormal shift is only evaluated when its count (125 - e) is in
    // range; the unconditional form shifted by an out-of-range amount for e > 125.
    const uint normalized = (e > 112) ? ((((e - 112) << 10) & 0x7C00) | (m >> 13)) : 0u;
    const uint denormalized = (e < 113 && e > 101) ? ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) : 0u;
    const uint saturated = (e > 143) ? 0x7FFFu : 0u;

    return static_cast<ushort>(sign | normalized | denormalized | saturated);
}
and fixed the problem using these helper functions:
VkDeviceSize size_2 = texWidth * texHeight * 4;// different from the above variables in question : allSize or size
//create half float for cubemap
void* half_pixelsArray[6];
half_pixelsArray[0] = new ushort[size_2];
half_pixelsArray[1] = new ushort[size_2];
half_pixelsArray[2] = new ushort[size_2];
half_pixelsArray[3] = new ushort[size_2];
half_pixelsArray[4] = new ushort[size_2];
half_pixelsArray[5] = new ushort[size_2];
//copy from float to half float
for (int i = 0; i < 6; ++i)
{
for (int j = 0; j < size_2; ++j)
{
((ushort*)half_pixelsArray[i])[j] = float_to_half( pixelsArray[i][j] );
}
}
// and change float to half flaot in memcpy
memcpy( (char*) data + (layerSize*i) , half_pixelsArray[i], static_cast<size_t>(layerSize));
I am trying to copy unsigned short from native code to managed code, but I get a heap corruption when calling memcpy.
INPUT: unsigned short* input
OUTPUT: array<unsigned short> output
I have the following code. If I set testDataSize to 100, I don't see the corruption.
Could someone please shed some light ?
Thanks,
// 16-bit unsigned sample type used for both the native and the managed buffers.
typedef unsigned short uns16;
// DLL Entry Point
// Test driver (C++/CLI): copies frSize 16-bit values from a native buffer into
// managed arrays in two different ways and times each with a Stopwatch.
void main()
{
int testDataSize = 600;
// Number of 16-bit elements to copy.
int frSize = testDataSize / 2;
for (int j = 0; j < 1; j++)
{
uns16* input;
array<uns16>^ output1;
array<uns16>^ output2;
// NOTE(review): `new uns16(frSize)` uses parentheses, so it allocates ONE
// uns16 initialised to the value frSize - not an array of frSize elements.
// Every later access beyond input[0] is therefore outside the allocation.
input = new uns16(frSize);
output1 = gcnew array <uns16>(frSize);
output2 = gcnew array <uns16>(frSize);
// initialize: writes frSize elements through `input`
for (int i = 0; i < frSize; i++)
{
input[i] = i;
}
//test 1: Marshal::Copy into a temporary signed array, then BlockCopy the raw
// bytes (frSize * 2) into the unsigned output array
Stopwatch^ sw1 = Stopwatch::StartNew();
//-------------------------------------------------------------------
array<short>^ frameDataSigned = gcnew array<short>(frSize);
Marshal::Copy(IntPtr((void*)(input)), frameDataSigned, 0, frameDataSigned->Length);
System::Buffer::BlockCopy(frameDataSigned, 0, output1, 0, (Int32)(frSize) * 2);
//-------------------------------------------------------------------
auto res1 = sw1->ElapsedTicks;
//test 2: pin the managed array and memcpy frSize * sizeof(uns16) bytes into it
Stopwatch^ sw2 = Stopwatch::StartNew();
//-------------------------------------------------------------------
cli::pin_ptr<uns16> pinnedManagedData = &output2[0];
memcpy(pinnedManagedData, (void*)(input), frSize * sizeof(uns16));
//-------------------------------------------------------------------
auto res2 = sw2->ElapsedTicks;
....
int frSize = 300;
input = new uns16(frSize);
This doesn't allocate an array. It allocates a single uint16_t, and sets its value to 300. You need to use square brackets to allocate an array.
input = new uns16[frSize];
On my Arduino Mega 2560, I'm trying to run a motor that turns a 20-vial container (accepting int input 1-20) while regulating temperature via PID of a separate cooler. I am generally new to this field of technology so bear with me. I also have an interrupt set up for an encoder to keep track of vial position.
The void serialEvent() and void loop() are the most important portions to look at, but I decided to put the rest of the code in there just in case you needed to see it.
#include <PID_v1.h>
#include <SPI.h>
#include <TMC26XStepper.h>
// Analog input read for the cooler temperature (passed to analogRead).
#define COOL_INPUT 0
// NOTE(review): not referenced in the code shown here - presumably the PWM
// output pin used by the elided temperature-regulation code; confirm.
#define PIN_OUTPUT 9
// Stepper driver instance.
// NOTE(review): argument meaning (steps/rev, pins, current) is defined by the
// TMC26XStepper library and is not visible in this sketch - confirm there.
TMC26XStepper tmc26XStepper = TMC26XStepper(200,5,7,6,500);
// Pin configured as OUTPUT in setup(); serialEvent() pulses pin 6 to step the motor.
int step = 6;
// Target encoder count for the requested vial (set in serialEvent()).
int value;
// Vial number most recently parsed from the serial port.
int i;
// Scratch variable used to drain leftover serial input.
char junk = ' ';
// Encoder position; written by encoder_isr(), hence volatile.
volatile long enc_count = 0;
// Pins whose CHANGE interrupts are routed to encoder_isr().
const byte interruptPinA = 2;
const byte interruptPinB = 3;
//Define Variables we'll be connecting to
// Limits handed to myPID.SetOutputLimits() in setup().
int outMax = 255;
int outMin = -145;
// Variables the PID object reads and writes through the pointers passed below.
double Setpoint, Input, Output;
// Not used in the code shown here.
double heatInput, heatOutput, originalInput;
//Specify the links and initial tuning parameters
// AGGRESSIVE VALUES (to get to 4 deg C)
double aggKp=8.0, aggKi=3.0, aggKd=0.15;
// CONSERVATIVE VALUES (to hover around 4 deg C)
double consKp=2.5, consKi = 0.0, consKd = 1.0;
// PID controller bound to Input/Output/Setpoint, started with the aggressive gains.
PID myPID(&Input, &Output, &Setpoint, aggKp, aggKi, aggKd, REVERSE);
// One-time initialisation: configures the step pin and the encoder interrupt
// pins, takes a first temperature reading, configures the PID controller,
// opens the serial port and starts the stepper driver.
void setup()
{
pinMode(step, OUTPUT);
// Encoder channels use the internal pull-ups; any edge on either channel
// invokes encoder_isr().
pinMode(interruptPinA, INPUT_PULLUP);
pinMode(interruptPinB, INPUT_PULLUP);
attachInterrupt(digitalPinToInterrupt(interruptPinA), encoder_isr, CHANGE);
attachInterrupt(digitalPinToInterrupt(interruptPinB), encoder_isr, CHANGE);
//initialize the variables we're linked to
// Scale the raw ADC reading: raw * 5.0 * 100.0 / 1024.
// NOTE(review): presumably converts a 10 mV/degree sensor on a 5 V reference
// to degrees C - confirm against the sensor in use.
Input = (5.0*analogRead(COOL_INPUT)*100.0) / 1024;
Setpoint = 10.75;
myPID.SetOutputLimits(outMin, outMax);
//turn the PID on
myPID.SetMode(AUTOMATIC);
Serial.begin(115200);
// Driver configuration; parameter meanings are defined by the TMC26XStepper library.
tmc26XStepper.setSpreadCycleChopper(2,24,8,6,0);
tmc26XStepper.setMicrosteps(32);
tmc26XStepper.setStallGuardThreshold(4,0);
Serial.println("...started...");
tmc26XStepper.start();
Serial.flush();
Serial.println("Enter vial numbers 1-20");
}
// Main loop: samples the cooler input, prints the scaled reading and waits
// 150 ms before the next pass.
void loop() {
  // Scale the raw ADC reading: raw * 5.0 * 100.0 / 1024.
  const int rawReading = analogRead(COOL_INPUT);
  Input = (5.0 * rawReading * 100.0) / 1024;
  // A BUNCH OF CODE FOR TEMP REGULATION
  Serial.println(Input);
  delay(150);
}
void serialEvent() {
while (Serial.available() == 0) {}
i = Serial.parseInt();
Serial.print("position: ");
Serial.print(i);
Serial.print(" ");
while (Serial.available() > 0) {
junk = Serial.read();
}
if (i == 1) {
value = 0;
} else {
int num = i - 1;
value = num * 72;
}
while (enc_count != value) {
digitalWrite(6, HIGH);
delayMicroseconds(100);
digitalWrite(6, LOW);
delayMicroseconds(100);
if (enc_count == 1440) {
enc_count = 0;
}
}
Serial.println(enc_count);
}
// INFO FOR ENCODER
void encoder_isr() {
static int8_t lookup_table[] = {0,-1,1,0,1,0,0,-1,-1,0,0,1,0,1,-1,0};
static uint8_t enc_val = 0;
enc_val = enc_val << 2;
enc_val = enc_val | ((PIND & 0b1100) >> 2);
enc_count = enc_count + lookup_table[enc_val & 0b1111];
}
So, originally I had the two processes tested separately (vial position + encoder, then temperature regulation) and everything did exactly what it was supposed to. Now I have fused the code together and put the vial position entry in the serialEvent() method, to keep the temperature reading continuous and the vial position entry available whenever I decide to provide input. However, when I enter a value, the program stops altogether. I am able to see the number I entered (position: 5), but the Serial.println(enc_count) never gets printed. On top of that, the temperature readings stop being displayed.
Any thoughts? Need more information?
I've been given the task of creating a Login API for our project and I'm supposed to use PBKDF2 with HMACSHA256 as the PRF. The plain text password is hashed using MD5 and then fed into the PBKDF2 to generate a derived key. The problem is, I'm not able to get the same output as what the project documentation is telling me.
Here's the PBKDF2 Implementation in Java:
public class PBKDF2
{
public static byte[] deriveKey( byte[] password, byte[] salt, int iterationCount, int dkLen )
throws java.security.NoSuchAlgorithmException, java.security.InvalidKeyException
{
SecretKeySpec keyspec = new SecretKeySpec( password, "HmacSHA256" );
Mac prf = Mac.getInstance( "HmacSHA256" );
prf.init( keyspec );
// Note: hLen, dkLen, l, r, T, F, etc. are horrible names for
// variables and functions in this day and age, but they
// reflect the terse symbols used in RFC 2898 to describe
// the PBKDF2 algorithm, which improves validation of the
// code vs. the RFC.
//
// dklen is expressed in bytes. (16 for a 128-bit key)
int hLen = prf.getMacLength(); // 20 for SHA1
int l = Math.max( dkLen, hLen); // 1 for 128bit (16-byte) keys
int r = dkLen - (l-1)*hLen; // 16 for 128bit (16-byte) keys
byte T[] = new byte[l * hLen];
int ti_offset = 0;
for (int i = 1; i <= l; i++) {
F( T, ti_offset, prf, salt, iterationCount, i );
ti_offset += hLen;
}
if (r < hLen) {
// Incomplete last block
byte DK[] = new byte[dkLen];
System.arraycopy(T, 0, DK, 0, dkLen);
return DK;
}
return T;
}
private static void F( byte[] dest, int offset, Mac prf, byte[] S, int c, int blockIndex ) {
final int hLen = prf.getMacLength();
byte U_r[] = new byte[ hLen ];
// U0 = S || INT (i);
byte U_i[] = new byte[S.length + 4];
System.arraycopy( S, 0, U_i, 0, S.length );
INT( U_i, S.length, blockIndex );
for( int i = 0; i < c; i++ ) {
U_i = prf.doFinal( U_i );
xor( U_r, U_i );
}
System.arraycopy( U_r, 0, dest, offset, hLen );
}
private static void xor( byte[] dest, byte[] src ) {
for( int i = 0; i < dest.length; i++ ) {
dest[i] ^= src[i];
}
}
private static void INT( byte[] dest, int offset, int i ) {
dest[offset + 0] = (byte) (i / (256 * 256 * 256));
dest[offset + 1] = (byte) (i / (256 * 256));
dest[offset + 2] = (byte) (i / (256));
dest[offset + 3] = (byte) (i);
}
// ctor
private PBKDF2 () {}
}
I used the test vectors found here (PBKDF2-HMAC-SHA2 test vectors) to verify the correctness of the implementation, and it all checked out. I'm not sure why I couldn't get the same results with an MD5-hashed password.
Parameters:
Salt: 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F
Iterations Count: 1000
DKLen: 16 (128-bit derived key)
Using "foobar" as the plaintext password, the expected results are:
PWHash = MD5(PlaintextPassword) = 3858f62230ac3c915f300c664312c63f
PWKey = PBKDF2(PWHash, Salt, IterationsCount, DKLen) = 33C37758EFA6780C5E52FAB3B50F329C
What I get:
PWHash = 3858f62230ac3c915f300c664312c63f
PWKey = 0bd0c7d8339df2c66ce4b6e1e91ed3f1
The iteration count was supposed to be 4096, not 1000.
The generation of int l seems wrong. You have specified the maximum between dkLen and hLen but the spec says l = CEIL (dkLen / hLen) with
CEIL (x) is the "ceiling" function, i.e. the smallest integer greater than, or equal to, x.
I think l would be more accurately defined as l = (int)Math.ceil( (double)dkLen / (double)hLen )
In my current project I need to find the pixel-exact position of an image contained in another, larger image. The smaller image is never rotated or stretched (so it should match pixel by pixel), but it may have a different brightness and some of its pixels may be distorted. My first attempt was to do it on the CPU, but it was too slow. The calculations are highly parallel, so I decided to use the GPU. I have just started to learn CUDA and wrote my first CUDA app. My code works, but it is still too slow even on the GPU. When the larger image is 1024x1280 and the smaller one is 128x128, the program performs the calculations in 2000 ms on a GeForce GTX 560 Ti. I need to get results in less than 200 ms. In the future I'll probably need a more complex algorithm, so I'd rather have even more computational power in reserve. The question is: how can I optimise my code to achieve that speed-up?
CUDAImageLib.dll:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cutil.h>
//#define SUPPORT_ALPHA
// Computes, for one candidate offset (bx, by) of the small image inside the
// big image, the sum of absolute per-channel differences over the whole small
// image, and stores it at BufferOut[bx + by * (bw - sw)].
//
// BufferB / BufferS hold 4 floats per pixel, row-major; bw x bh and sw x sh
// are the pixel dimensions of the big and small image.
// One thread handles one offset. Threads whose offset lies outside the
// (bw - sw) x (bh - sh) search range do nothing, which lets the host launch a
// grid rounded up to whole blocks.
__global__ void ImageSearch_kernel(float* BufferOut, float* BufferB, float* BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
    const unsigned int outW = bw - sw;
    const unsigned int outH = bh - sh;
    const unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
    if (bx >= outW || by >= outH)
    {
        return;
    }

    float diff = 0;
    for (unsigned int y = 0; y < sh; ++y)
    {
        for (unsigned int x = 0; x < sw; ++x)
        {
            // Index of the first channel of the pixel in the small / big buffer.
            const unsigned int as = (x + y * sw) * 4;
            const unsigned int ab = (x + bx + (y + by) * bw) * 4;
#ifdef SUPPORT_ALPHA
            // Weight the colour difference by both fourth-channel values.
            diff += ((fabsf(BufferS[as] - BufferB[ab]) + fabsf(BufferS[as + 1] - BufferB[ab + 1]) + fabsf(BufferS[as + 2] - BufferB[ab + 2])) * BufferS[as + 3] * BufferB[ab + 3]);
#else
            diff += fabsf(BufferS[as] - BufferB[ab]);
            diff += fabsf(BufferS[as + 1] - BufferB[ab + 1]);
            diff += fabsf(BufferS[as + 2] - BufferB[ab + 2]);
#endif
        }
    }
    BufferOut[bx + (by * outW)] = diff;
}

// Host entry point: uploads both images, runs ImageSearch_kernel over every
// candidate offset and copies the difference map back to BufferOut.
//
//   BufferOut  host buffer of (bw - sw) * (bh - sh) floats (output)
//   BufferB    big image, bw * bh * 4 floats
//   BufferS    small image, sw * sh * 4 floats
//
// Returns the kernel time reported by the cutil timer, truncated to int, or 0
// when the arguments are invalid or any CUDA call fails (the failure is
// reported on stderr).
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
    // Reject arguments for which there is no search range or no data.
    if (BufferOut == 0 || BufferB == 0 || BufferS == 0 || sw <= 0 || sh <= 0 || bw <= sw || bh <= sh) {
        fprintf(stderr, "ImageSearchGPU: invalid arguments\n");
        return 0;
    }

    // All locals are declared before the first `goto Error` so that no jump
    // crosses an initialisation.
    const unsigned int outW = (unsigned int)(bw - sw);
    const unsigned int outH = (unsigned int)(bh - sh);

    // Byte counts are computed in size_t so large images do not overflow int.
    const size_t aBytes = (size_t)bw * (size_t)bh * 4 * sizeof(float);
    const size_t bBytes = (size_t)sw * (size_t)sh * 4 * sizeof(float);
    const size_t cBytes = (size_t)outW * (size_t)outH * sizeof(float);

    dim3 threadsPerBlock(32, 32);
    // Round the grid up so the whole search range is covered even when it is
    // not a multiple of the block size; the kernel ignores surplus threads.
    dim3 numBlocks((outW + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (outH + threadsPerBlock.y - 1) / threadsPerBlock.y);

    float *dev_B = 0;
    float *dev_S = 0;
    float *dev_Out = 0;
    unsigned int timer = 0;
    float sExecutionTime = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers (two inputs, one output).
    cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy both images from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // NOTE(review): the cutil timer created here is never released - confirm
    // whether the cutil version in use requires an explicit delete.
    cutCreateTimer(&timer);
    cutStartTimer(timer);

    // Launch the kernel with one thread per candidate offset.
    ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);

    // A rejected launch (for example an invalid configuration) is reported
    // through cudaGetLastError, not necessarily by the synchronize call below.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "ImageSearch_kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while it ran.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching ImageSearch_kernel!\n", cudaStatus);
        goto Error;
    }
    cutStopTimer(timer);
    sExecutionTime = cutGetTimerValue(timer);

    // Copy the difference map from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // cudaFree accepts null pointers, so this is safe on every path.
    cudaFree(dev_Out);
    cudaFree(dev_B);
    cudaFree(dev_S);
    return (int)sExecutionTime;
}
// Returns the index of the smallest element of values[0 .. count-1].
// Ties resolve to the first occurrence. Returns 0 when `values` is null or
// `count` is not positive.
//
// The running minimum is seeded from the data itself rather than from a fixed
// constant, so arrays whose entries all exceed that constant are still
// searched correctly. NaN entries are never chosen over a number: a NaN that
// is the current best is replaced by the first non-NaN value that follows.
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
    if (!values)
    {
        return 0;
    }

    int minIndex = 0;
    for (int i = 1; i < count; ++i)
    {
        const float candidate = values[i];
        const float best = values[minIndex];
        // `x != x` is true only for NaN.
        const bool bestIsNaN = (best != best);
        const bool candidateIsNumber = (candidate == candidate);
        if (candidate < best || (bestIsNaN && candidateIsNumber))
        {
            minIndex = i;
        }
    }
    return minIndex;
}
C# test app:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;
namespace TestCUDAImageSearch
{
    /// <summary>
    /// Console driver: loads "Big.png" and "Small.png", searches for the small
    /// image inside the big one and prints the position and the timings.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            using (Bitmap big = new Bitmap("Big.png"))
            using (Bitmap small = new Bitmap("Small.png"))
            {
                Console.WriteLine("Big " + big.Width + "x" + big.Height + "    Small " + small.Width + "x" + small.Height);

                // Time the complete search, including the image conversion and transfers.
                Stopwatch totalWatch = Stopwatch.StartNew();
                Point location = CUDAImageLIb.ImageSearch(big, small);
                totalWatch.Stop();
                long totalMs = totalWatch.ElapsedMilliseconds;

                Console.WriteLine("Image found at " + location.X + "x" + location.Y);
                Console.WriteLine("total time=" + totalMs + "ms     kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
            }

            // Keep the console window open until a key is pressed.
            Console.WriteLine("Hit key");
            Console.ReadKey();
        }
    }
}
//#define SUPPORT_HSB
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
namespace TestCUDAImageSearch
{
    /// <summary>
    /// Managed wrapper around CUDAImageLib.dll: converts bitmaps to float
    /// buffers, runs the GPU difference search and interprets the result.
    /// </summary>
    public static class CUDAImageLIb
    {
        [DllImport("CUDAImageLib.dll")]
        private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);

        [DllImport("CUDAImageLib.dll")]
        private static extern int FindMinCPU(float[] values, int count);

        private static int _lastKernelTime = 0;

        /// <summary>Value returned by the most recent ImageSearchGPU call.</summary>
        public static int LastKernelTime
        {
            get { return _lastKernelTime; }
        }

        /// <summary>
        /// Finds the offset of <paramref name="small"/> inside <paramref name="big"/>
        /// with the smallest difference.
        /// </summary>
        /// <returns>The offset with the smallest difference.</returns>
        /// <exception cref="ArgumentNullException">A bitmap is null.</exception>
        /// <exception cref="ArgumentException">The small image is not smaller than the big image in both dimensions.</exception>
        public static Point ImageSearch(Bitmap big, Bitmap small)
        {
            int mx;
            float[] diffs = ComputeDiffs(big, small, out mx);
            int minIndex = FindMinCPU(diffs, diffs.Length);
            // The difference map is row-major with mx entries per row.
            return new Point(minIndex % mx, minIndex / mx);
        }

        /// <summary>
        /// Finds every offset of <paramref name="small"/> inside <paramref name="big"/>
        /// whose difference is below <paramref name="maxDeviation"/>.
        /// </summary>
        /// <returns>The matching offsets in row-major order.</returns>
        /// <exception cref="ArgumentNullException">A bitmap is null.</exception>
        /// <exception cref="ArgumentException">The small image is not smaller than the big image in both dimensions.</exception>
        public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
        {
            int mx;
            float[] diffs = ComputeDiffs(big, small, out mx);
            List<Point> points = new List<Point>();
            for (int i = 0; i < diffs.Length; ++i)
            {
                if (diffs[i] < maxDeviation)
                {
                    points.Add(new Point(i % mx, i / mx));
                }
            }
            return points;
        }

        // Runs the GPU search shared by both ImageSearch overloads and returns the
        // difference map; mx receives the number of entries per row of that map.
        private static float[] ComputeDiffs(Bitmap big, Bitmap small, out int mx)
        {
            if (big == null)
            {
                throw new ArgumentNullException("big");
            }
            if (small == null)
            {
                throw new ArgumentNullException("small");
            }

            int bw = big.Width;
            int bh = big.Height;
            int sw = small.Width;
            int sh = small.Height;
            mx = bw - sw;
            int my = bh - sh;
            // Without a positive search range the difference map would be empty
            // (or its size negative) and the index arithmetic would divide by zero.
            if (mx <= 0 || my <= 0)
            {
                throw new ArgumentException("The small image must be smaller than the big image in both dimensions.", "small");
            }

            float[] diffs = new float[mx * my];
            float[] b = ImageToFloat(big);
            float[] s = ImageToFloat(small);
            _lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
            return diffs;
        }

#if SUPPORT_HSB
        // Converts a bitmap to 4 floats per pixel: hue / 360, saturation,
        // brightness and alpha scaled to 0..1 (matching the scaling used by the
        // non-HSB conversion).
        private static float[] ImageToFloat(Bitmap img)
        {
            int w = img.Width;
            int h = img.Height;
            float[] pix = new float[w * h * 4];
            int i = 0;
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < w; ++x)
                {
                    Color c = img.GetPixel(x, y);
                    pix[i] = c.GetHue() / 360;
                    pix[i + 1] = c.GetSaturation();
                    pix[i + 2] = c.GetBrightness();
                    pix[i + 3] = c.A / 255.0f;
                    i += 4;
                }
            }
            return pix;
        }
#else
        // Converts a bitmap to 4 floats per pixel, each one byte of the 32-bit
        // pixel value scaled to 0..1, lowest byte first.
        private static float[] ImageToFloat(Bitmap bmp)
        {
            int w = bmp.Width;
            int h = bmp.Height;
            int n = w * h;
            float[] pix = new float[n * 4];
            int[] pixels = new int[n];

            Rectangle r = new Rectangle(0, 0, w, h);
            // Request 32bpp ARGB explicitly so that bitmaps stored in another
            // format (for example 24bpp) are converted while locking instead of
            // being read with the wrong layout.
            BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
            try
            {
                // Copy row by row, honouring the stride reported for the locked data.
                for (int y = 0; y < h; ++y)
                {
                    IntPtr row = new IntPtr(bmpData.Scan0.ToInt64() + (long)y * bmpData.Stride);
                    Marshal.Copy(row, pixels, y * w, w);
                }
            }
            finally
            {
                bmp.UnlockBits(bmpData);
            }

            int j = 0;
            for (int i = 0; i < n; ++i)
            {
                pix[j] = (pixels[i] & 255) / 255.0f;
                pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
                pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
                pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
                j += 4;
            }
            return pix;
        }
#endif
    }
}
Looks like what you are talking about is a well known problem: Template matching. The easiest way forward is to convolve the Image (the bigger image) with the template (the smaller image). You could implement convolutions in one of two ways.
1) Modify the convolutions example from the CUDA SDK (similar to what you are doing anyway).
2) Use FFTs to implement the convolution. Ref. Convolution theorem. You will need to remember
% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));
You could use cufft to implement the 2 dimensional FFTs (After padding them appropriately). You will need to write a kernel that does element wise multiplication and then normalizes the result (because CUFFT does not normalize) before performing the inverse FFT.
For the sizes you mention (1024 x 1280 and 128 x 128), the inputs must be padded to at least (1024 + 128 - 1) x (1280 + 128 - 1) = 1151 x 1407. But FFTs are fastest when the (padded) inputs are powers of 2, so you will need to pad both the large and the small image to 2048 x 2048.
You could speed up your calculations by using faster memory access, for example by using
Texture Cache for the big image
Shared Memory or Constant Cache for the small image or parts of it.
But your real problem is the whole approach of your comparison. Comparing the images pixel by pixel at every possible location will never be efficient. There is just too much work to do. First you should think about finding ways to
Select the interesting image regions in the big image where the small image might be contained and only search in these
Find a faster comparison mechanism, by something representing the images that are not their pixels values. You should be able to compare the images by computing a representation with less data, e.g. a color histogram, or integral images.