Optimizing an OpenCL kernel - optimization
I am trying to optimize this kernel. The CPU version of this kernel is 4 times faster than the GPU version, but I would expect the GPU version to be faster.
It might be that we have a lot of memory accesses and that is why performance is low. I am using an Intel HD Graphics 2500 and OpenCL 1.2.
The GPU kernel is:
__kernel void mykernel(__global unsigned char *inp1,
                       __global unsigned char *inp2,
                       __global unsigned char *inp3,
                       __global unsigned char *inp4,
                       __global unsigned char *outp1,
                       __global unsigned char *outp2,
                       __global unsigned char *outp3,
                       __global unsigned char *outp4,
                       __global unsigned char *lut,
                       uint size)
{
    unsigned char x1, x2, x3, x4;
    unsigned char y1, y2, y3, y4;
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int width = get_global_size(0);
    const uint id = y * width + x;

    x1 = inp1[id];
    x2 = inp2[id];
    x3 = inp3[id];
    x4 = inp4[id];

    y1 = (x1 & 0xff) | (x2>>2 & 0xaa) | (x3>>4 & 0x0d) | (x4>>6 & 0x02);
    y2 = (x1<<2 & 0xff) | (x2 & 0xaa) | (x3>>2 & 0x0d) | (x4>>4 & 0x02);
    y3 = (x1<<4 & 0xff) | (x2<<2 & 0xaa) | (x3 & 0x0d) | (x4>>2 & 0x02);
    y4 = (x1<<6 & 0xff) | (x2<<4 & 0xaa) | (x3<<2 & 0x0d) | (x4 & 0x02);

    // lookup table
    y1 = lut[y1];
    y2 = lut[y2];
    y3 = lut[y3];
    y4 = lut[y4];

    outp1[id] = (y1 & 0xc0)
              | ((y2 & 0xc0) >> 2)
              | ((y3 & 0xc0) >> 4)
              | ((y4 & 0xc0) >> 6);
    outp2[id] = ((y1 & 0x30) << 2)
              | (y2 & 0x30)
              | ((y3 & 0x30) >> 2)
              | ((y4 & 0x30) >> 4);
    outp3[id] = ((y1 & 0x0c) << 4)
              | ((y2 & 0x0c) << 2)
              | (y3 & 0x0c)
              | ((y4 & 0x0c) >> 2);
    outp4[id] = ((y1 & 0x03) << 6)
              | ((y2 & 0x03) << 4)
              | ((y3 & 0x03) << 2)
              | (y4 & 0x03);
}
I use:

size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 1;
globalWorkSize[0] = X*Y; // X,Y define a data space of 15 - 20 MB

localWorkSize[0] can vary between 1 and 256.
For localWorkSize = 1 I get:

CPU = 0.067 s
GPU = 0.20 s

For localWorkSize = 256 I get:

CPU = 0.067 s
GPU = 0.34 s
This is really weird. Can you give me some ideas why I get these strange numbers? And do you have any tips on how I can optimize this kernel?
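(A side note on methodology, not from the original question: clock() measures host CPU time, which can misrepresent time spent waiting on the GPU. OpenCL event profiling gives per-kernel device timestamps instead; a minimal sketch, using the names from the main below and assuming the queue is created with CL_QUEUE_PROFILING_ENABLE:)

cl_event evt;
err = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL,
                             globalWorkSize, localWorkSize, 0, NULL, &evt);
clWaitForEvents(1, &evt);

cl_ulong t_start = 0, t_end = 0;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, NULL);
printf("kernel time: %.3f ms\n", (t_end - t_start) * 1e-6); // timestamps are in nanoseconds
clReleaseEvent(evt);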
My main looks like this:
int main(int argc, char** argv)
{
    int err, err1, j, i;            // error code returned from api calls and other
    clock_t start, end;             // measuring performance variables
    cl_device_id device_id;         // compute device id
    cl_context context;             // compute context
    cl_command_queue commands;      // compute command queue
    cl_program program_ms_naive;    // compute program
    cl_kernel kernel_ms_naive;      // compute kernel

    // ... dynamically allocate arrays
    // ... initialize arrays

    cl_uint dev_cnt = 0;
    clGetPlatformIDs(0, 0, &dev_cnt);
    cl_platform_id platform_ids[100];
    clGetPlatformIDs(dev_cnt, platform_ids, NULL);

    // Connect to a compute device
    err = clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    // Create a compute context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

    // Create a command queue
    commands = clCreateCommandQueue(context, device_id, 0, &err);

    // Create the compute program from the source file
    program_ms_naive = clCreateProgramWithSource(context, 1, (const char **) &kernelSource_ms, NULL, &err);

    // Build the program executable
    err = clBuildProgram(program_ms_naive, 0, NULL, NULL, NULL, NULL);

    // Create the compute kernel in the program we wish to run
    kernel_ms_naive = clCreateKernel(program_ms_naive, "ms_naive", &err);

    d_A1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A1, &err);
    d_A2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A2, &err);
    d_A3 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A3, &err);
    d_A4 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A4, &err);
    d_lut = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 256, h_ltable, &err);
    d_B1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
    d_B2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
    d_B3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
    d_B4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);

    int size = YCOLUMNS*XROWS/4;
    int size_b = size * 4;

    err  = clSetKernelArg(kernel_ms_naive, 0, sizeof(cl_mem), (void *)&d_A1);
    err |= clSetKernelArg(kernel_ms_naive, 1, sizeof(cl_mem), (void *)&d_A2);
    err |= clSetKernelArg(kernel_ms_naive, 2, sizeof(cl_mem), (void *)&d_A3);
    err |= clSetKernelArg(kernel_ms_naive, 3, sizeof(cl_mem), (void *)&d_A4);
    err |= clSetKernelArg(kernel_ms_naive, 4, sizeof(cl_mem), (void *)&d_B1);
    err |= clSetKernelArg(kernel_ms_naive, 5, sizeof(cl_mem), (void *)&d_B2);
    err |= clSetKernelArg(kernel_ms_naive, 6, sizeof(cl_mem), (void *)&d_B3);
    err |= clSetKernelArg(kernel_ms_naive, 7, sizeof(cl_mem), (void *)&d_B4);
    err |= clSetKernelArg(kernel_ms_naive, 8, sizeof(cl_mem), (void *)&d_lut); // __global
    err |= clSetKernelArg(kernel_ms_naive, 9, sizeof(cl_uint), (void *)&size_b);

    size_t localWorkSize[1], globalWorkSize[1];
    localWorkSize[0] = 256;
    globalWorkSize[0] = XROWS*YCOLUMNS;

    start = clock();
    for (i = 0; i < EXECUTION_TIMES; i++)
    {
        err1 = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
        err = clFinish(commands);
    }
    end = clock();

    return 0;
}
Constant memory is used to broadcast a small set of values to all the work items and behaves like a constant private register, hence the very fast access speed. Typical GPU devices support at least 16 kB of constant memory, which should be enough to hold the 256-byte LUT.
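(If in doubt, the actual limit can be queried; a minimal sketch using the standard device-info call, with the device_id from the main above:)

cl_ulong const_size = 0;
clGetDeviceInfo(device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                sizeof(const_size), &const_size, NULL);
printf("max constant buffer size: %llu bytes\n", (unsigned long long)const_size);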
You can try constant memory as a simple solution for the global-access bottleneck:
__kernel void mykernel(const __global unsigned char *inp1,
                       const __global unsigned char *inp2,
                       const __global unsigned char *inp3,
                       const __global unsigned char *inp4,
                       __global unsigned char *outp1,
                       __global unsigned char *outp2,
                       __global unsigned char *outp3,
                       __global unsigned char *outp4,
                       __constant unsigned char *lut,
                       uint size)
{
    ...
}
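Note that the host side does not change for this: d_lut remains an ordinary cl_mem buffer passed with the same clSetKernelArg call; the __constant qualifier only tells the kernel compiler which address space to place it in.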
But a proper solution would be to reshape your code:

- Use vectors of char4 (uchar4 here) instead of 4 different buffers, because splitting each element across four byte buffers breaks coalescing [it can give you a big boost, up to 4x]; see the sketch after this list.
- Operate on vectors [slight boost].
- Use local/constant memory for the LUT [it can remove a non-coalesced read of the LUT, maybe 2x-3x].
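A minimal sketch of the vectorized direction (hypothetical kernel name mykernel_vec; it assumes the host packs inp1..inp4 into a single uchar4 buffer per element and unpacks the uchar4 results afterwards, while the shift/mask logic is copied unchanged from the question):

__kernel void mykernel_vec(__global const uchar4 *inp,   // inp[id] = (inp1[id], inp2[id], inp3[id], inp4[id])
                           __global uchar4 *outp,        // outp[id] = (outp1[id], ..., outp4[id])
                           __constant unsigned char *lut)
{
    const uint id = get_global_id(0);
    const uchar4 x = inp[id];   // one coalesced 4-byte load instead of four 1-byte loads

    uchar4 y;
    y.s0 = (x.s0 & 0xff)    | (x.s1>>2 & 0xaa) | (x.s2>>4 & 0x0d) | (x.s3>>6 & 0x02);
    y.s1 = (x.s0<<2 & 0xff) | (x.s1 & 0xaa)    | (x.s2>>2 & 0x0d) | (x.s3>>4 & 0x02);
    y.s2 = (x.s0<<4 & 0xff) | (x.s1<<2 & 0xaa) | (x.s2 & 0x0d)    | (x.s3>>2 & 0x02);
    y.s3 = (x.s0<<6 & 0xff) | (x.s1<<4 & 0xaa) | (x.s2<<2 & 0x0d) | (x.s3 & 0x02);

    y.s0 = lut[y.s0];
    y.s1 = lut[y.s1];
    y.s2 = lut[y.s2];
    y.s3 = lut[y.s3];

    uchar4 o;
    o.s0 = (y.s0 & 0xc0)        | ((y.s1 & 0xc0) >> 2) | ((y.s2 & 0xc0) >> 4) | ((y.s3 & 0xc0) >> 6);
    o.s1 = ((y.s0 & 0x30) << 2) | (y.s1 & 0x30)        | ((y.s2 & 0x30) >> 2) | ((y.s3 & 0x30) >> 4);
    o.s2 = ((y.s0 & 0x0c) << 4) | ((y.s1 & 0x0c) << 2) | (y.s2 & 0x0c)        | ((y.s3 & 0x0c) >> 2);
    o.s3 = ((y.s0 & 0x03) << 6) | ((y.s1 & 0x03) << 4) | ((y.s2 & 0x03) << 2) | (y.s3 & 0x03);
    outp[id] = o;               // one coalesced 4-byte store instead of four 1-byte stores
}

The point of the layout change is that each work item now issues one 4-byte load and one 4-byte store instead of four scattered 1-byte accesses each way.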
Still, it will be difficult to beat the CPU approach, due to the big I/O constraints.
Related
SHA256 OpenCL kernel needed [help needed]
I need a SHA256 kernel file. I am using Cloo as my OpenCL library, and it will be included in a WPF project. I am calculating a hash value several times, and the program needs about 30 minutes or so to do that, but my search results claimed OpenCL would reduce that time to under 3 minutes or less. Thanks in advance.

[Edit] OK, now I managed to do it using this: https://searchcode.com/file/45893396/src/opencl/sha256_kernel.cl/ - it works fine with strings, yet when sending my byte-array header to be hashed, it returns a very different value than expected.

[Edit 2] It cannot handle large arrays; any array longer than 32 returns messy results.
Found this and I modified it to calculate a double hash, if anyone needs it:

#ifndef uint8_t
#define uint8_t unsigned char
#endif
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#ifndef uint64_t
#define uint64_t unsigned long int
#endif

#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

typedef struct
{
    uint32_t state[8];
    uint64_t count;
    uint8_t buffer[64];
} CSha256;

inline void Sha256_Init(CSha256 *p)
{
    p->state[0] = 0x6a09e667;
    p->state[1] = 0xbb67ae85;
    p->state[2] = 0x3c6ef372;
    p->state[3] = 0xa54ff53a;
    p->state[4] = 0x510e527f;
    p->state[5] = 0x9b05688c;
    p->state[6] = 0x1f83d9ab;
    p->state[7] = 0x5be0cd19;
    p->count = 0;
}

#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25))
#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10))

#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))

#define Ch2(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))

#define sha_a(i) T[(0-(i))&7]
#define sha_b(i) T[(1-(i))&7]
#define sha_c(i) T[(2-(i))&7]
#define sha_d(i) T[(3-(i))&7]
#define sha_e(i) T[(4-(i))&7]
#define sha_f(i) T[(5-(i))&7]
#define sha_g(i) T[(6-(i))&7]
#define sha_h(i) T[(7-(i))&7]

#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch2(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
    d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
    R(a,b,c,d,e,f,g,h, i);   \
    R(h,a,b,c,d,e,f,g, i+1); \
    R(g,h,a,b,c,d,e,f, i+2); \
    R(f,g,h,a,b,c,d,e, i+3); \
    R(e,f,g,h,a,b,c,d, i+4); \
    R(d,e,f,g,h,a,b,c, i+5); \
    R(c,d,e,f,g,h,a,b, i+6); \
    R(b,c,d,e,f,g,h,a, i+7)
#else
#define R(i) sha_h(i) += S1(sha_e(i)) + Ch2(sha_e(i),sha_f(i),sha_g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
    sha_d(i) += sha_h(i); sha_h(i) += S0(sha_a(i)) + Maj(sha_a(i), sha_b(i), sha_c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif

static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

inline static void Sha256_Transform(uint32_t *state, const uint32_t *data)
{
    uint32_t W[16];
    unsigned j;
#ifdef _SHA256_UNROLL2
    uint32_t a,b,c,d,e,f,g,h;
    a = state[0]; b = state[1]; c = state[2]; d = state[3];
    e = state[4]; f = state[5]; g = state[6]; h = state[7];
#else
    uint32_t T[8];
    for (j = 0; j < 8; j++)
        T[j] = state[j];
#endif
    for (j = 0; j < 64; j += 16)
    {
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
        RX_8(0); RX_8(8);
#else
        unsigned i;
        for (i = 0; i < 16; i++) { R(i); }
#endif
    }
#ifdef _SHA256_UNROLL2
    state[0] += a; state[1] += b; state[2] += c; state[3] += d;
    state[4] += e; state[5] += f; state[6] += g; state[7] += h;
#else
    for (j = 0; j < 8; j++)
        state[j] += T[j];
#endif
    /* Wipe variables */
    /* memset(W, 0, sizeof(W)); */
    /* memset(T, 0, sizeof(T)); */
}

#undef S0
#undef S1
#undef s0
#undef s1

inline static void Sha256_WriteByteBlock(CSha256 *p)
{
    uint32_t data32[16];
    unsigned i;
    for (i = 0; i < 16; i++)
        data32[i] = ((uint32_t)(p->buffer[i * 4    ]) << 24) +
                    ((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
                    ((uint32_t)(p->buffer[i * 4 + 2]) <<  8) +
                    ((uint32_t)(p->buffer[i * 4 + 3]));
    Sha256_Transform(p->state, data32);
}

inline void Sha256_Update(CSha256 *p, __global const uint8_t *data, size_t size)
{
    uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
    while (size > 0)
    {
        p->buffer[curBufferPos++] = *data++;
        p->count++;
        size--;
        if (curBufferPos == 64)
        {
            curBufferPos = 0;
            Sha256_WriteByteBlock(p);
        }
    }
}

inline void Sha256_Final(CSha256 *p, __global uint8_t *digest)
{
    uint64_t lenInBits = (p->count << 3);
    uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
    unsigned i;
    p->buffer[curBufferPos++] = 0x80;
    while (curBufferPos != (64 - 8))
    {
        curBufferPos &= 0x3F;
        if (curBufferPos == 0)
            Sha256_WriteByteBlock(p);
        p->buffer[curBufferPos++] = 0;
    }
    for (i = 0; i < 8; i++)
    {
        p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
        lenInBits <<= 8;
    }
    Sha256_WriteByteBlock(p);
    for (i = 0; i < 8; i++)
    {
        *digest++ = (uint8_t)(p->state[i] >> 24);
        *digest++ = (uint8_t)(p->state[i] >> 16);
        *digest++ = (uint8_t)(p->state[i] >> 8);
        *digest++ = (uint8_t)(p->state[i]);
    }
    Sha256_Init(p);
}

inline void Sha256_Update1(CSha256 *p, const uint8_t *data, uint32_t size)
{
    uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
    while (size > 0)
    {
        p->buffer[curBufferPos++] = *data++;
        p->count++;
        size--;
        if (curBufferPos == 64)
        {
            curBufferPos = 0;
            Sha256_WriteByteBlock(p);
        }
    }
}

inline void Sha256_Final1(CSha256 *p, uint8_t *digest)
{
    uint64_t lenInBits = (p->count << 3);
    uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
    unsigned i;
    p->buffer[curBufferPos++] = 0x80;
    while (curBufferPos != (64 - 8))
    {
        curBufferPos &= 0x3F;
        if (curBufferPos == 0)
            Sha256_WriteByteBlock(p);
        p->buffer[curBufferPos++] = 0;
    }
    for (i = 0; i < 8; i++)
    {
        p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
        lenInBits <<= 8;
    }
    Sha256_WriteByteBlock(p);
    for (i = 0; i < 8; i++)
    {
        *digest++ = (uint8_t)(p->state[i] >> 24);
        *digest++ = (uint8_t)(p->state[i] >> 16);
        *digest++ = (uint8_t)(p->state[i] >> 8);
        *digest++ = (uint8_t)(p->state[i]);
    }
    Sha256_Init(p);
}

__kernel void Sha256_1(__global uint8_t *header, __global uint8_t *toRet)
{
    uint8_t tempHdr[80];
    uint8_t tempDigest[32] = {0};
    uint startNon = toRet[0] + (toRet[1] << 8) + (toRet[2] << 16) + (toRet[3] << 24);
    uint maxNon = toRet[4] + (toRet[5] << 8) + (toRet[6] << 16) + (toRet[7] << 24);
    uint nonce = startNon;
    uint32_t finalNon = 0;
    uint8_t match = 0;
    for (int x = 0; x < 80; x++)
        tempHdr[x] = header[x];
    tempHdr[76] = (char)(nonce);
    tempHdr[77] = (char)(nonce >> 8);
    tempHdr[78] = (char)(nonce >> 16);
    tempHdr[79] = (char)(nonce >> 24);
    while (finalNon < 1)
    {
        CSha256 p;
        Sha256_Init(&p);
        Sha256_Update1(&p, tempHdr, 80);
        Sha256_Final1(&p, tempDigest);
        CSha256 p1;
        Sha256_Init(&p1);
        Sha256_Update1(&p1, tempDigest, 32);
        Sha256_Final1(&p1, tempDigest);
        for (int x = 31; x > 21; x--)
        {
            if (tempDigest[x] < 1) match++;
        }
        if (match > 8)
        {
            finalNon = nonce;
            toRet[8]  = (char)(nonce);
            toRet[9]  = (char)(nonce >> 8);
            toRet[10] = (char)(nonce >> 16);
            toRet[11] = (char)(nonce >> 24);
        }
        else
        {
            nonce++;
            tempHdr[76] = (char)(nonce);
            tempHdr[77] = (char)(nonce >> 8);
            tempHdr[78] = (char)(nonce >> 16);
            tempHdr[79] = (char)(nonce >> 24);
        }
        match = 0;
        if (nonce > maxNon) break;
        if (nonce <= startNon) break;
    }
}
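(For reference, the Sha256_1 kernel above expects its search range packed little-endian into toRet before launch and writes the winning nonce back at bytes 8..11; a small host-side sketch in C, where the startNon/maxNon values are hypothetical:)

uint8_t toRet[12] = {0};
uint32_t startNon = 0;          // hypothetical start of the nonce range
uint32_t maxNon = 0xFFFFFFFF;   // hypothetical end of the nonce range
toRet[0] = (uint8_t)startNon;
toRet[1] = (uint8_t)(startNon >> 8);
toRet[2] = (uint8_t)(startNon >> 16);
toRet[3] = (uint8_t)(startNon >> 24);
toRet[4] = (uint8_t)maxNon;
toRet[5] = (uint8_t)(maxNon >> 8);
toRet[6] = (uint8_t)(maxNon >> 16);
toRet[7] = (uint8_t)(maxNon >> 24);
// ... copy toRet to the device, run Sha256_1, copy it back ...
uint32_t found = toRet[8] | (toRet[9] << 8) | (toRet[10] << 16) | ((uint32_t)toRet[11] << 24);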
How to get 4 bytes of data (uint8_t) into a variable of type uint32_t
I've been working with the Cypress BLE PSoC 4200, and I've set up my GATT database to send int32 data packets to my iPhone. However, you can only write to the GATT database with uint8 pieces of data. So I wrote the following to take this int32 voltage reading and put it into a uint8 byte array:

// function passes in int32 variable 'result'
uint8 array[4];
array[0] = result & 0xFF;
array[1] = (result >> 8) & 0xFF;
array[2] = (result >> 16) & 0xFF;
array[3] = (result >> 24) & 0xFF;

With that in mind, when that int32 packet gets sent, I want to be able to take each byte and recombine them somehow into the original int32 value, and print it to the screen (e.g. 456000 will be 0.456 V). Right now, I obtain the 4 bytes and handle them like so:

NSData* data = [characteristic value];
const uint8_t *reportData = [data bytes];
// variable to hold the eventual 32-bit data
uint32_t voltage = 0;

Is there a way to go through each index of *reportData and concatenate the bytes? Any help will do, thanks.
Would something like this not work?

uint32_t v0 = (uint32_t)reportData[0];
uint32_t v1 = (uint32_t)reportData[1] << 8;
uint32_t v2 = (uint32_t)reportData[2] << 16;
uint32_t v3 = (uint32_t)reportData[3] << 24;
uint32_t voltage = v0 | v1 | v2 | v3;
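(An equivalent shortcut, assuming the peripheral sends the least significant byte first and the receiving CPU is little-endian, as iPhone hardware is in practice:)

#include <string.h> // for memcpy

uint32_t voltage;
memcpy(&voltage, reportData, sizeof(voltage)); // copies the 4 bytes as-is, no shifting needed

The explicit-shift version above remains the more portable choice, since it works regardless of host byte order.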
Strange Kinect acceleration values with OpenNI and avin SensorKinect
According to [1], I should be able to access Kinect accelerometer data with request 0x32, providing a buffer of 10 bytes. The accelerometer vector xyz values should be short ints at bytes 3 through 8. As stated in the text (and as expected), with a horizontal, stationary camera, I should get values near 0 for both x and z, and about 981 for y. This would be g and would make sense. Instead, while the y values are as expected, I get x and z values near 0xffff. Here's the code (I skipped error checking for better readability):

const unsigned short VENDOR_ID_MSFT = 0x045e;
const unsigned short PRODUCT_ID_KINECT360_MOTOR = 0x02b0;

XN_USB_DEV_HANDLE deviceHandle = NULL;
const XnUSBConnectionString *paths = NULL;
XnUInt32 count;

xnUSBInit();
xnUSBEnumerateDevices( VENDOR_ID_MSFT, PRODUCT_ID_KINECT360_MOTOR, &paths, &count );
xnUSBOpenDeviceByPath( paths[0], &this->deviceHandle );

// init motor
xnUSBSendControl( this->deviceHandle, (XnUSBControlType) 0xc0, 0x10, 0x00, 0x00, buf, sizeof( buf ), 0 );

XnStatus res;
XnUChar buf[10] = { 0 };
XnUInt32 size = 0;

// query motor data
xnUSBReceiveControl( this->deviceHandle, XN_USB_CONTROL_TYPE_VENDOR, 0x32, 0, 0, buf, sizeof( buf ), &size, 0 );

int accelCountX = (int) ( ( (short) buf[2] << 8 ) | buf[3] );
int accelCountY = (int) ( ( (short) buf[4] << 8 ) | buf[5] );
int accelCountZ = (int) ( ( (short) buf[6] << 8 ) | buf[7] );

std::cout << accelCountX << "/" << accelCountY << "/" << accelCountZ << std::endl;

The output shows values kind of like these: 65503/847/65516. Any idea what the problem could be? Thanks!

[1] http://fivedots.coe.psu.ac.th/~ad/jg/nui16/motorControl.pdf
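(A plausible reading, not from the original thread: 65503 is 0xFFDF, i.e. -33 when interpreted as a signed 16-bit value, so x and z may simply be small negative readings whose sign is lost because the assembled 16-bit value is widened as if it were unsigned. Casting the combined 16 bits to short before widening would preserve the sign:)

int accelCountX = (short)( ( buf[2] << 8 ) | buf[3] ); // sign-extends the 16-bit reading
int accelCountY = (short)( ( buf[4] << 8 ) | buf[5] );
int accelCountZ = (short)( ( buf[6] << 8 ) | buf[7] );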
Setting up the Accelerate framework for FFT on the iPhone
I have set up a function to configure the Accelerate framework, after reading: Using the Apple FFT and Accelerate Framework, iPhone FFT with Accelerate framework vDSP, and the Apple docs. I did this:

void fftSetup()
{
    COMPLEX_SPLIT A;
    FFTSetup setupReal;
    uint32_t log2n;
    uint32_t n, nOver2;
    int32_t stride;
    uint32_t i;
    float *originalReal, *obtainedReal;   // note: originalReal is never allocated before use below
    float scale;
    uint32_t L = 1024;
    float *mag = new float[L/2];

    log2n = 10;
    n = 1 << log2n;
    stride = 1;
    nOver2 = n / 2;

    printf("1D real FFT of length log2 ( %d ) = %d\n\n", n, log2n);

    for (i = 0; i < n; i++)
        originalReal[i] = (float) (i + 1);
    vDSP_ctoz((COMPLEX *) originalReal, 2, &A, 1, nOver2);

    A.realp = (float *) malloc(nOver2 * sizeof(float));
    A.imagp = (float *) malloc(nOver2 * sizeof(float));

    setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
    vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
    vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_INVERSE);

    // get magnitude
    for (i = 1; i < L/2; i++) {
        mag[i] = sqrtf(A.realp[i]*A.realp[i] + A.imagp[i] * A.imagp[i]);
    }

    scale = (float) 1.0 / (2 * n);
    vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
    vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);
}

Questions: my app always crashes with no error (BAD ACCESS) on one of these 2 lines:

originalReal[i] = (float) (i + 1);
// or
vDSP_ctoz((COMPLEX *) originalReal, 2, &A, 1, nOver2);

I guess I did not set a good value for log2n? (10 to get a 1024 window?) How do I get the real magnitude of the bins? Is my actual FFT the same one I wrote here? Where do I input MY data buffer array (exactly where in my code? instead of originalReal?). Thanks a lot.
I actually managed to make it work when I inserted a sine wave of a certain f into it. This is the code:

COMPLEX_SPLIT A;
FFTSetup setupReal;
uint32_t log2n;
uint32_t n, nOver2;
int32_t stride;
uint32_t i;
float *originalReal, *obtainedReal;
float scale;
uint32_t L = 1024;
float *mag = new float[L/2];

log2n = 10;
n = 1 << log2n;
stride = 1;
nOver2 = n / 2;
//printf("1D real FFT of length log2 ( %d ) = %d\n\n", n, log2n);

A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
originalReal = (float *) malloc(n * sizeof(float));
obtainedReal = (float *) malloc(n * sizeof(float));

for (i = 0; i < n; i++)
    originalReal[i] = cos(2*3.141592*11000*i/44100); //(float) (i + 1);
vDSP_ctoz((COMPLEX *) originalReal, 2, &A, 1, nOver2);

setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
//vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_INVERSE);

scale = (float) 1.0 / (2 * n);
vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);

// get magnitude
for (i = 1; i < L/2; i++)
{
    mag[i] = sqrtf(A.realp[i]*A.realp[i] + A.imagp[i] * A.imagp[i]);
    NSLog(@"%d:%f", i, mag[i]);
}

Actually, it's not 44 Hz between bins, as the guy wrote in the post above, but 43! 22050/512 = 43. This is critical, because in the higher bins, such as bin[300], you get a completely different result for 44 and 43 (a 300 Hz drift). So take care of that.
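(To make the bin arithmetic explicit: with sample rate fs = 44100 Hz and n = 1024 real samples, the bin spacing is fs/n = 44100/1024 ≈ 43.07 Hz, the same as the 22050/512 above; a tiny helper, where fs and n just mirror the values used in the answer:)

// center frequency of bin k for an n-point real FFT sampled at fs
float binFrequency(unsigned k)
{
    const float fs = 44100.0f; // sample rate used above
    const float n = 1024.0f;   // FFT length used above
    return k * fs / n;         // bin spacing = fs / n ≈ 43.07 Hz
}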
How to change the value of an NSColor object into its 8-bit value
I need to convert the value of an NSColor object into an 8-bit integer value. Code:

uint8_t r = (uint32_t)(MIN(1.0f, MAX(0.0f, [[CWhiteBoardController ReturnFillColor] redComponent])) * 0xff);
uint8_t g = (uint32_t)(MIN(1.0f, MAX(0.0f, [[CWhiteBoardController ReturnFillColor] greenComponent])) * 0xff);
uint8_t b = (uint32_t)(MIN(1.0f, MAX(0.0f, [[CWhiteBoardController ReturnFillColor] blueComponent])) * 0xff);
uint8_t a = (uint32_t)(MIN(1.0f, MAX(0.0f, [[CWhiteBoardController ReturnFillColor] alphaComponent])) * 0xff);

uint8_t value = (a << 24) | (r << 16) | (g << 8) | b;

The value I receive is 0. I am not seeing where I am going wrong, so could anyone please help me out?
I came to know the problem. Actually, I needed to write

int value = (a << 24) | (r << 16) | (g << 8) | b;

in place of

uint8_t value = (a << 24) | (r << 16) | (g << 8) | b;
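(Why this fixes it: the packed ARGB result needs 32 bits, and storing it in a uint8_t keeps only the low byte, i.e. just b, which can easily be 0. A width-explicit version of the same fix:)

// promote each channel to 32 bits before shifting, and keep all 32 bits of the result
uint32_t value = ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | (uint32_t)b;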