How to quickly find a image in another image using CUDA? - optimization

In my current project I need to find pixel exact position of image contained in another image of larger size. Smaller image is never rotated or stretched (so should match pixel by pixel) but it may have different brightness and some pixels in the image may be distorted. My first attemp was to do it on CPU but it was too slow. The calculations are very parallel, so I decided to use the GPU. I just started to learn CUDA and wrote my first CUDA app. My code works but it still is too slow even on GPU. When the larger image has a dimension of 1024x1280 and smaller is 128x128 program performs calculations in 2000ms on GeForce GTX 560 ti. I need to get results in less than 200ms. In the future I'll probably need a more complex algorithm, so I'd rather have even more computational power reserve. The question is how I can optimise my code to achieve that speed up?
CUDAImageLib.dll:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cutil.h>
//#define SUPPORT_ALPHA
__global__ void ImageSearch_kernel(float* BufferOut, float* BufferB, float* BufferS, unsigned int bw, unsigned int bh, unsigned int sw, unsigned int sh)
{
unsigned int bx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int by = threadIdx.y + blockIdx.y * blockDim.y;
float diff = 0;
for (unsigned int y = 0; y < sh; ++y)
{
for (unsigned int x = 0; x < sw; ++x)
{
unsigned int as = (x + y * sw) * 4;
unsigned int ab = (x + bx + (y + by) * bw) * 4;
#ifdef SUPPORT_ALPHA
diff += ((abs(BufferS[as] - BufferB[ab]) + abs(BufferS[as + 1] - BufferB[ab + 1]) + abs(BufferS[as + 2] - BufferB[ab + 2])) * BufferS[as + 3] * BufferB[ab + 3]);
#else
diff += abs(BufferS[as] - BufferB[ab]);
diff += abs(BufferS[as + 1] - BufferB[ab + 1]);
diff += abs(BufferS[as + 2] - BufferB[ab + 2]);
#endif
}
}
BufferOut[bx + (by * (bw - sw))] = diff;
}
extern "C" int __declspec(dllexport) __stdcall ImageSearchGPU(float* BufferOut, float* BufferB, float* BufferS, int bw, int bh, int sw, int sh)
{
int aBytes = (bw * bh) * 4 * sizeof(float);
int bBytes = (sw * sh) * 4 * sizeof(float);
int cBytes = ((bw - sw) * (bh - sh)) * sizeof(float);
dim3 threadsPerBlock(32, 32);
dim3 numBlocks((bw - sw) / threadsPerBlock.x, (bh - sh) / threadsPerBlock.y);
float *dev_B = 0;
float *dev_S = 0;
float *dev_Out = 0;
unsigned int timer = 0;
float sExecutionTime = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_Out, cBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_B, aBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_S, bBytes);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_B, BufferB, aBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_S, BufferS, bBytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cutCreateTimer(&timer);
cutStartTimer(timer);
// Launch a kernel on the GPU with one thread for each element.
ImageSearch_kernel<<<numBlocks, threadsPerBlock>>>(dev_Out, dev_B, dev_S, bw, bh, sw, sh);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cutStopTimer(timer);
sExecutionTime = cutGetTimerValue(timer);
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(BufferOut, dev_Out, cBytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_Out);
cudaFree(dev_B);
cudaFree(dev_S);
return (int)sExecutionTime;
}
extern "C" int __declspec(dllexport) __stdcall FindMinCPU(float* values, int count)
{
int minIndex = 0;
float minValue = 3.4e+38F;
for (int i = 0; i < count; ++i)
{
if (values[i] < minValue)
{
minValue = values[i];
minIndex = i;
}
}
return minIndex;
}
C# test app:
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Drawing;
namespace TestCUDAImageSearch
{
class Program
{
static void Main(string[] args)
{
using(Bitmap big = new Bitmap("Big.png"), small = new Bitmap("Small.png"))
{
Console.WriteLine("Big " + big.Width + "x" + big.Height + " Small " + small.Width + "x" + small.Height);
Stopwatch sw = new Stopwatch();
sw.Start();
Point point = CUDAImageLIb.ImageSearch(big, small);
sw.Stop();
long t = sw.ElapsedMilliseconds;
Console.WriteLine("Image found at " + point.X + "x" + point.Y);
Console.WriteLine("total time=" + t + "ms kernel time=" + CUDAImageLIb.LastKernelTime + "ms");
}
Console.WriteLine("Hit key");
Console.ReadKey();
}
}
}
//#define SUPPORT_HSB
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
namespace TestCUDAImageSearch
{
public static class CUDAImageLIb
{
[DllImport("CUDAImageLib.dll")]
private static extern int ImageSearchGPU(float[] bufferOut, float[] bufferB, float[] bufferS, int bw, int bh, int sw, int sh);
[DllImport("CUDAImageLib.dll")]
private static extern int FindMinCPU(float[] values, int count);
private static int _lastKernelTime = 0;
public static int LastKernelTime
{
get { return _lastKernelTime; }
}
public static Point ImageSearch(Bitmap big, Bitmap small)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
float[] diffs = new float[mx * my];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
int minIndex = FindMinCPU(diffs, diffs.Length);
return new Point(minIndex % mx, minIndex / mx);
}
public static List<Point> ImageSearch(Bitmap big, Bitmap small, float maxDeviation)
{
int bw = big.Width;
int bh = big.Height;
int sw = small.Width;
int sh = small.Height;
int mx = (bw - sw);
int my = (bh - sh);
int nDiff = mx * my;
float[] diffs = new float[nDiff];
float[] b = ImageToFloat(big);
float[] s = ImageToFloat(small);
_lastKernelTime = ImageSearchGPU(diffs, b, s, bw, bh, sw, sh);
List<Point> points = new List<Point>();
for(int i = 0; i < nDiff; ++i)
{
if (diffs[i] < maxDeviation)
{
points.Add(new Point(i % mx, i / mx));
}
}
return points;
}
#if SUPPORT_HSB
private static float[] ImageToFloat(Bitmap img)
{
int w = img.Width;
int h = img.Height;
float[] pix = new float[w * h * 4];
int i = 0;
for (int y = 0; y < h; ++y)
{
for (int x = 0; x < w; ++x)
{
Color c = img.GetPixel(x, y);
pix[i] = c.GetHue() / 360;
pix[i + 1] = c.GetSaturation();
pix[i + 2] = c.GetBrightness();
pix[i + 3] = c.A;
i += 4;
}
}
return pix;
}
#else
private static float[] ImageToFloat(Bitmap bmp)
{
int w = bmp.Width;
int h = bmp.Height;
int n = w * h;
float[] pix = new float[n * 4];
System.Diagnostics.Debug.Assert(bmp.PixelFormat == PixelFormat.Format32bppArgb);
Rectangle r = new Rectangle(0, 0, w, h);
BitmapData bmpData = bmp.LockBits(r, ImageLockMode.ReadOnly, bmp.PixelFormat);
System.Diagnostics.Debug.Assert(bmpData.Stride > 0);
int[] pixels = new int[n];
System.Runtime.InteropServices.Marshal.Copy(bmpData.Scan0, pixels, 0, n);
bmp.UnlockBits(bmpData);
int j = 0;
for (int i = 0; i < n; ++i)
{
pix[j] = (pixels[i] & 255) / 255.0f;
pix[j + 1] = ((pixels[i] >> 8) & 255) / 255.0f;
pix[j + 2] = ((pixels[i] >> 16) & 255) / 255.0f;
pix[j + 3] = ((pixels[i] >> 24) & 255) / 255.0f;
j += 4;
}
return pix;
}
#endif
}
}

Looks like what you are talking about is a well known problem: Template matching. The easiest way forward is to convolve the Image (the bigger image) with the template (the smaller image). You could implement convolutions in one of two ways.
1) Modify the convolutions example from the CUDA SDK (similar to what you are doing anyway).
2) Use FFTs to implement the convolution. Ref. Convolution theorem. You will need to remember
% MATLAB format
L = size(A) + size(B) - 1;
conv2(A, B) = IFFT2(FFT2(A, L) .* FFT2(B, L));
You could use cufft to implement the 2 dimensional FFTs (After padding them appropriately). You will need to write a kernel that does element wise multiplication and then normalizes the result (because CUFFT does not normalize) before performing the inverse FFT.
For the sizes you mention, (1024 x 1280 and 128 x 128), the inputs must be padded to atleast ((1024 + 128 - 1) x (1280 + 128 -1) = 1151 x 1407). But FFTs are fastest when the (padded) inputs are powers of 2. So you will need to pad both the large and small images to size 2048 x 2048.

You could speed up your calculations by using faster memory access, for example by using
Texture Cache for the big image
Shared Memory or Constant Cache for the small image or parts of it.
But your real problem is the whole approach of your comparison. Comparing the images pixel by pixel at every possible location will never be efficient. There is just too much work to do. First you should think about finding ways to
Select the interesting image regions in the big image where the small image might be contained and only search in these
Find a faster comparison mechanism, by something representing the images that are not their pixels values. You should be able to compare the images by computing a representation with less data, e.g. a color histogram, or integral images.

Related

Vulkan: Loading floating point cubemap textures distorted

I am using vulkan-tutorial codes and i made modify for cubemap.
when i use VK_FORMAT_R8G8B8A8_UNORM is working with this code:
unsigned char* pixelsArray[6];
for (int i = 0; i < 6; ++i)
{
pixelsArray[i] = stbi_load(imageFileArray[i].c_str(), &texWidth, &texHeight, &texChannels, STBI_rgb_alpha);
}
VkDeviceSize allSize = texWidth * texHeight * 4 * 6;
VkDeviceSize size = texWidth * texHeight * 4 ;
VkBufferCreateInfo bufferInfo{};
...
bufferInfo.size = allSize ;
vkMapMemory(device, stagingBufferMemory, 0, AllSize, 0, &data);
for(int i = 0; i < 6; ++i)
{
memcpy( (char*) data + (size*i) , pixelsArray[i], static_cast<size_t>(size));
}
vkUnmapMemory(device, stagingBufferMemory);
VkImageCreateInfo imageInfo{};
...
imageInfo.arrayLayers = 6;
imageInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
imageInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
VkImageViewCreateInfo viewInfo{};
...
viewInfo.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
viewInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
viewInfo.subresourceRange.layerCount = 6;
but when i try VK_FORMAT_R16G16B16A16_SFLOAT is giving distorted display and no validation error with this code:
float* pixelsArray[6];
for (int i = 0; i < 6; ++i)
{
pixelsArray[i] = stbi_loadf(imageFileArray[i].c_str(), &texWidth, &texHeight, &texChannels, STBI_rgb_alpha);
}
VkDeviceSize allSize = texWidth * texHeight * 4 * 6 * 2;// I added *2
VkDeviceSize size = texWidth * texHeight * 4 * 2;// I added *2
VkBufferCreateInfo bufferInfo{};
...
bufferInfo.size = allSize ;
vkMapMemory(device, stagingBufferMemory, 0, AllSize, 0, &data);
for(int i = 0; i < 6; ++i)
{
memcpy( (char*) data + (size*i) , pixelsArray[i], static_cast<size_t>(size));
}
vkUnmapMemory(device, stagingBufferMemory);
VkImageCreateInfo imageInfo{};
...
imageInfo.arrayLayers = 6;
imageInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
imageInfo.flags = VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
VkImageViewCreateInfo viewInfo{};
...
viewInfo.viewType = VK_IMAGE_VIEW_TYPE_CUBE;
viewInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
viewInfo.subresourceRange.layerCount = 6;
when VK_FORMAT_R8G8B8A8_UNORM :
when VK_FORMAT_R16G16B16A16_SFLOAT :
i fixed the problem. problem was that i want to use half float but i was sending float to memcpy function.i searched how can i use half float and i found a solution without using extra library.
what i did add helper functions :
typedef unsigned int uint;
typedef unsigned short ushort;
uint as_uint(const float x)
{
return *(uint*)&x;
}
ushort float_to_half(const float x)
{
// IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
const uint b = as_uint(x)+0x00001000; // round-to-nearest-even: add last bit after truncated mantissa
const uint e = (b&0x7F800000)>>23; // exponent
const uint m = b&0x007FFFFF; // mantissa; in line below: 0x007FF000 = 0x00800000-0x00001000 = decimal indicator flag - initial rounding
return (b&0x80000000)>>16 | (e>112)*((((e-112)<<10)&0x7C00)|m>>13) | ((e<113)&(e>101))*((((0x007FF000+m)>>(125-e))+1)>>1) | (e>143)*0x7FFF; // sign : normalized : denormalized : saturate
}
and fix problem with this helper functions :
VkDeviceSize size_2 = texWidth * texHeight * 4;// different from the above variables in question : allSize or size
//create half float for cubemap
void* half_pixelsArray[6];
half_pixelsArray[0] = new ushort[size_2];
half_pixelsArray[1] = new ushort[size_2];
half_pixelsArray[2] = new ushort[size_2];
half_pixelsArray[3] = new ushort[size_2];
half_pixelsArray[4] = new ushort[size_2];
half_pixelsArray[5] = new ushort[size_2];
//copy from float to half float
for (int i = 0; i < 6; ++i)
{
for (int j = 0; j < size_2; ++j)
{
((ushort*)half_pixelsArray[i])[j] = float_to_half( pixelsArray[i][j] );
}
}
// and change float to half flaot in memcpy
memcpy( (char*) data + (layerSize*i) , half_pixelsArray[i], static_cast<size_t>(layerSize));

Why does writing to memory take longer than reading in GPU?

Here is part of my code where I update some values and write it to another part of my memory
__kernel void update(
__global float4 *f,
__global float4 *p,
const unsigned int n,
__local float4 *sharedP){
float4 val = (float4)(0.f,0.f,0.f,0.f);
size_t lID = get_local_id(0);
size_t gID = get_global_id(0);
if(gID < n){
float4 p1 = p[gID];
for(size_t i = 0; i < n; ++i){
if(i!=gID){
float4 p2 = p1 - p[i];
//Some other calculations to finally update 'val'
}
}
f[gID] = val;
}
}
Here each thread reads from the global memory n+1 times, but writes back only once.(My initial thought was that, the bottleneck was due repeated call to the global memory, hence I had made some changes to copy a block of data into the shared memory, swap it and other things. Even this did not improve the performance a lot). However later I found that the bottleneck is due to f[gID] = val;. For example when the kernel is run 200 times with n = 8192, it takes 3.5s; but if I comment out f[gID] = val; it takes only 0.05s.
Code for other method
__kernel void update(
__global float4 *f,
__global float4 *p,
const unsigned int n,
__local float4 *sharedP){
float4 val = (float4)(0.f,0.f,0.f,0.f);
size_t lID = get_local_id(0);
size_t gID = get_global_id(0);
if(gID < n){
sharedP[lID] = p[gID];
barrier(CLK_LOCAL_MEM_FENCE);
float4 p1 = sharedP[lID];
//To prevent i!=gID of other method
for(size_t i = 0; i < get_local_size(0);; ++i){
if(lID!=i){
float4 p2 = p1 - sharedP[i];
//Some other calculations to update 'val' at 0
}
}
for(size_t block = 1; block < (size_t)(get_global_size(0)/get_local_size(0)); ++block){
size_t idx = lID + (get_group_id(0) + block ) * get_local_size(0);
if( idx >= n)
idx -= n;
sharedP[lID] = p[idx];
barrier(CLK_LOCAL_MEM_FENCE);
for(size_t i = 0; i < get_local_size(0); ++i){
float4 p = p1 - sharedP[i];
//Some other calculations to update 'val'
}
f[gID] = val;
}
}
}
So how is writing to memory causing it to slow down so much and why is using shared memory not providing a significant performance increase?

mupdf render jpeg2000 lose color?

I am working on an android project, which use vudroid, which in turn use mupdf version 0.5.
Vudroid remove the original openjpeg support of mupdf, I have ported the mupdf version 1.5's openjpeg support.
But I encounter a new problem, color information in jpx image gone, the desired effect:
my effect:
the ported load-jpx code:
#include "fitz.h"
#include "mupdf.h"
/* Without the definition of OPJ_STATIC, compilation fails on windows
* due to the use of __stdcall. We believe it is required on some
* linux toolchains too. */
#define OPJ_STATIC
#ifndef _MSC_VER
#define OPJ_HAVE_STDINT_H
#endif
#include <openjpeg.h>
static void fz_opj_error_callback(const char *msg, void *client_data)
{
//fz_context *ctx = (fz_context *)client_data;
//fz_warn(ctx, "openjpeg error: %s", msg);
}
static void fz_opj_warning_callback(const char *msg, void *client_data)
{
//fz_context *ctx = (fz_context *)client_data;
//fz_warn(ctx, "openjpeg warning: %s", msg);
}
static void fz_opj_info_callback(const char *msg, void *client_data)
{
/* fz_warn("openjpeg info: %s", msg); */
}
typedef struct stream_block_s
{
unsigned char *data;
int size;
int pos;
} stream_block;
static OPJ_SIZE_T fz_opj_stream_read(void * p_buffer, OPJ_SIZE_T p_nb_bytes, void * p_user_data)
{
stream_block *sb = (stream_block *)p_user_data;
int len;
len = sb->size - sb->pos;
if (len < 0)
len = 0;
if (len == 0)
return (OPJ_SIZE_T)-1; /* End of file! */
if ((OPJ_SIZE_T)len > p_nb_bytes)
len = p_nb_bytes;
memcpy(p_buffer, sb->data + sb->pos, len);
sb->pos += len;
return len;
}
static OPJ_OFF_T fz_opj_stream_skip(OPJ_OFF_T skip, void * p_user_data)
{
stream_block *sb = (stream_block *)p_user_data;
if (skip > sb->size - sb->pos)
skip = sb->size - sb->pos;
sb->pos += skip;
return sb->pos;
}
static OPJ_BOOL fz_opj_stream_seek(OPJ_OFF_T seek_pos, void * p_user_data)
{
stream_block *sb = (stream_block *)p_user_data;
if (seek_pos > sb->size)
return OPJ_FALSE;
sb->pos = seek_pos;
return OPJ_TRUE;
}
fz_error
fz_load_jpx(pdf_image* img, unsigned char *data, int size, fz_colorspace *defcs, int indexed)
{
//fz_pixmap *img;
opj_dparameters_t params;
opj_codec_t *codec;
opj_image_t *jpx;
opj_stream_t *stream;
fz_colorspace *colorspace;
unsigned char *p;
OPJ_CODEC_FORMAT format;
int a, n, w, h, depth, sgnd;
int x, y, k, v;
stream_block sb;
if (size < 2)
fz_throw("not enough data to determine image format");
/* Check for SOC marker -- if found we have a bare J2K stream */
if (data[0] == 0xFF && data[1] == 0x4F)
format = OPJ_CODEC_J2K;
else
format = OPJ_CODEC_JP2;
opj_set_default_decoder_parameters(&params);
if (indexed)
params.flags |= OPJ_DPARAMETERS_IGNORE_PCLR_CMAP_CDEF_FLAG;
codec = opj_create_decompress(format);
opj_set_info_handler(codec, fz_opj_info_callback, 0);
opj_set_warning_handler(codec, fz_opj_warning_callback, 0);
opj_set_error_handler(codec, fz_opj_error_callback, 0);
if (!opj_setup_decoder(codec, &params))
{
fz_throw("j2k decode failed");
}
stream = opj_stream_default_create(OPJ_TRUE);
sb.data = data;
sb.pos = 0;
sb.size = size;
opj_stream_set_read_function(stream, fz_opj_stream_read);
opj_stream_set_skip_function(stream, fz_opj_stream_skip);
opj_stream_set_seek_function(stream, fz_opj_stream_seek);
opj_stream_set_user_data(stream, &sb);
/* Set the length to avoid an assert */
opj_stream_set_user_data_length(stream, size);
if (!opj_read_header(stream, codec, &jpx))
{
opj_stream_destroy(stream);
opj_destroy_codec(codec);
fz_throw("Failed to read JPX header");
}
if (!opj_decode(codec, stream, jpx))
{
opj_stream_destroy(stream);
opj_destroy_codec(codec);
opj_image_destroy(jpx);
fz_throw("Failed to decode JPX image");
}
opj_stream_destroy(stream);
opj_destroy_codec(codec);
/* jpx should never be NULL here, but check anyway */
if (!jpx)
fz_throw("opj_decode failed");
pdf_logimage("opj_decode succeeded");
for (k = 1; k < (int)jpx->numcomps; k++)
{
if (!jpx->comps[k].data)
{
opj_image_destroy(jpx);
fz_throw("image components are missing data");
}
if (jpx->comps[k].w != jpx->comps[0].w)
{
opj_image_destroy(jpx);
fz_throw("image components have different width");
}
if (jpx->comps[k].h != jpx->comps[0].h)
{
opj_image_destroy(jpx);
fz_throw("image components have different height");
}
if (jpx->comps[k].prec != jpx->comps[0].prec)
{
opj_image_destroy(jpx);
fz_throw("image components have different precision");
}
}
n = jpx->numcomps;
w = jpx->comps[0].w;
h = jpx->comps[0].h;
depth = jpx->comps[0].prec;
sgnd = jpx->comps[0].sgnd;
if (jpx->color_space == OPJ_CLRSPC_SRGB && n == 4) { n = 3; a = 1; }
else if (jpx->color_space == OPJ_CLRSPC_SYCC && n == 4) { n = 3; a = 1; }
else if (n == 2) { n = 1; a = 1; }
else if (n > 4) { n = 4; a = 1; }
else { a = 0; }
if (defcs)
{
if (defcs->n == n)
{
colorspace = defcs;
}
else
{
fz_warn("jpx file and dict colorspaces do not match");
defcs = NULL;
}
}
if (!defcs)
{
switch (n)
{
case 1: colorspace = pdf_devicegray; break;
case 3: colorspace = pdf_devicergb; break;
case 4: colorspace = pdf_devicecmyk; break;
}
}
//error = fz_new_pixmap(&img, colorspace, w, h);
//if (error)
// return error;
pdf_logimage("colorspace handled\n");
int bpc = 1;
if (colorspace) {
bpc = 1 + colorspace->n;
};
pdf_logimage("w = %d, bpc = %d, h = %d\n", w, bpc, h);
img->samples = fz_newbuffer(w * bpc * h);
//opj_image_destroy(jpx);
//fz_throw("out of memory loading jpx");
p = (char*)img->samples->bp;
pdf_logimage("start to deal with samples");
for (y = 0; y < h; y++)
{
for (x = 0; x < w; x++)
{
for (k = 0; k < n + a; k++)
{
v = jpx->comps[k].data[y * w + x];
if (sgnd)
v = v + (1 << (depth - 1));
if (depth > 8)
v = v >> (depth - 8);
*p++ = v;
}
if (!a)
*p++ = 255;
}
}
img->samples->wp = p;
pdf_logimage("start to deal with samples succeeded");
opj_image_destroy(jpx);
// if (a)
// {
// if (n == 4)
// {
// fz_pixmap *tmp = fz_new_pixmap(ctx, fz_device_rgb(ctx), w, h);
// fz_convert_pixmap(ctx, tmp, img);
// fz_drop_pixmap(ctx, img);
// img = tmp;
// }
// fz_premultiply_pixmap(ctx, img);
// }
return fz_okay;
}
The render code:
JNIEXPORT jbyteArray JNICALL Java_org_vudroid_pdfdroid_codec_PdfPage_drawPage
(JNIEnv *env, jclass clazz, jlong dochandle, jlong pagehandle)
{
renderdocument_t *doc = (renderdocument_t*) dochandle;
renderpage_t *page = (renderpage_t*) pagehandle;
//DEBUG("PdfView(%p).drawpage(%p, %p)", this, doc, page);
fz_error error;
fz_matrix ctm;
fz_irect viewbox;
fz_pixmap *pixmap;
jfloat *matrix;
jint *viewboxarr;
jint *dimen;
jint *buffer;
int length, val;
pixmap = nil;
/* initialize parameter arrays for MuPDF */
ctm.a = 1;
ctm.b = 0;
ctm.c = 0;
ctm.d = 1;
ctm.e = 0;
ctm.f = 0;
// matrix = (*env)->GetPrimitiveArrayCritical(env, matrixarray, 0);
// ctm.a = matrix[0];
// ctm.b = matrix[1];
// ctm.c = matrix[2];
// ctm.d = matrix[3];
// ctm.e = matrix[4];
// ctm.f = matrix[5];
// (*env)->ReleasePrimitiveArrayCritical(env, matrixarray, matrix, 0);
// DEBUG("Matrix: %f %f %f %f %f %f",
// ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f);
// viewboxarr = (*env)->GetPrimitiveArrayCritical(env, viewboxarray, 0);
// viewbox.x0 = viewboxarr[0];
// viewbox.y0 = viewboxarr[1];
// viewbox.x1 = viewboxarr[2];
// viewbox.y1 = viewboxarr[3];
// (*env)->ReleasePrimitiveArrayCritical(env, viewboxarray, viewboxarr, 0);
// DEBUG("Viewbox: %d %d %d %d",
// viewbox.x0, viewbox.y0, viewbox.x1, viewbox.y1);
viewbox.x0 = 0;
viewbox.y0 = 0;
viewbox.x1 = 595;
viewbox.y1 = 841;
/* do the rendering */
DEBUG("doing the rendering...");
//buffer = (*env)->GetPrimitiveArrayCritical(env, bufferarray, 0);
// do the actual rendering:
error = fz_rendertree(&pixmap, doc->rast, page->page->tree,
ctm, viewbox, 1);
/* evil magic: we transform the rendered image's byte order
*/
int x, y;
if (bmpdata)
fz_free(bmpdata);
bmpstride = ((pixmap->w * 3 + 3) / 4) * 4;
bmpdata = fz_malloc(pixmap->h * bmpstride);
DEBUG("inside drawpage, bmpstride = %d, pixmap->w = %d, pixmap->h = %d\n", bmpstride, pixmap->w, pixmap->h);
if (!bmpdata)
return;
for (y = 0; y < pixmap->h; y++)
{
unsigned char *p = bmpdata + y * bmpstride;
unsigned char *s = pixmap->samples + y * pixmap->w * 4;
for (x = 0; x < pixmap->w; x++)
{
p[x * 3 + 0] = s[x * 4 + 3];
p[x * 3 + 1] = s[x * 4 + 2];
p[x * 3 + 2] = s[x * 4 + 1];
}
}
FILE* fp = fopen("/sdcard/drawpage", "wb");
fwrite(bmpdata, pixmap->h * bmpstride, 1, fp);
fclose(fp);
jbyteArray array = (*env)->NewByteArray(env, pixmap->h * bmpstride);
(*env)->SetByteArrayRegion(env, array, 0, pixmap->h * bmpstride, bmpdata);
// if(!error) {
// DEBUG("Converting image buffer pixel order");
// length = pixmap->w * pixmap->h;
// unsigned int *col = pixmap->samples;
// int c = 0;
// for(val = 0; val < length; val++) {
// col[val] = ((col[val] & 0xFF000000) >> 24) |
// ((col[val] & 0x00FF0000) >> 8) |
// ((col[val] & 0x0000FF00) << 8);
// }
// winconvert(pixmap);
// }
// (*env)->ReleasePrimitiveArrayCritical(env, bufferarray, buffer, 0);
fz_free(pixmap);
if (error) {
DEBUG("error!");
throw_exception(env, "error rendering page");
}
DEBUG("PdfView.drawPage() done");
return array;
}
I have compare the jpx output samples to the mupdf-1.5 windows, it is the same, but the colorspace of original jpx have gone.
Could help me to get the colorspace back?
It seems you are trying to use an old version of MuPDF with some bits pulled in from a more recent version. TO be honest that's hardly likely to work. I would also guess that its not the OpenJPEG library causing your problem, since the image appears, but converted to grayscale.
Have you tried opening the file in the current version of MuPDF ? Does it work ?
If so then it seems to me your correct approach should be to use the current code, not try and bolt pieces onto an older version.

How to get total CPU Idle Time in Objective C/C on OS X?

I need to get total CPU Idle Time in Objective C/C on OS X?
If possible please provide code example which do that.
Here is a code I use to get these metrics. As result percentage is not the same I have in Activity Monitor. So I assume CPU time calculation is incorrect:
#include <sys/sysctl.h>
#include <sys/types.h>
#include <mach/mach.h>
#include <mach/processor_info.h>
#include <mach/mach_host.h>
- (void)printCPUUsage
{
processor_cpu_load_info_t cpuLoad;
mach_msg_type_number_t processorMsgCount;
natural_t processorCount;
uint64_t totalSystemTime = 0, totalUserTime = 0, totalIdleTime = 0, totalCPUTime = 0;
kern_return_t err = host_processor_info(mach_host_self(), PROCESSOR_CPU_LOAD_INFO, &processorCount, (processor_info_array_t *)&cpuLoad, &processorMsgCount);
for (natural_t i = 0; i < processorCount; i++) {
// Calc load types and totals, with guards against 32-bit overflow
// (values are natural_t)
uint64_t system = 0, user = 0, idle = 0, total = 0;
system = cpuLoad[i].cpu_ticks[CPU_STATE_SYSTEM];
user = cpuLoad[i].cpu_ticks[CPU_STATE_USER];
idle = cpuLoad[i].cpu_ticks[CPU_STATE_IDLE];
total = system + user + idle;
if (total < 1) {
total = 1;
}
totalCPUTime += total;
totalSystemTime += system;
totalUserTime += user;
totalIdleTime += idle;
}
double onePercent = totalCPUTime/100.0f;
NSLog(#"system: %f", (double)totalSystemTime/(double)onePercent);
NSLog(#"user: %f", (double)totalUserTime/(double)onePercent);
NSLog(#"idle: %f", (double)totalIdleTime/(double)onePercent);
}
The values as returned from the process meter or top are, by default, sample delta based i.e. they calculate the CPU usage since the previous sample, rather than the absolute values.
This corresponds to the option -c n to top when called in the mode:
top -c n -l 0 | head -5
Which is the default mode. If you want the values as returned in your code, then you need to base the values on immediate samples, using:
top -c e -l 0 | head -5
These values will correspond to the values you're seeing.
If you want to get similar values to the process meter/top, then you need to take two samples, and display the values of the differences between them.
So for example, we create a structure containing the stats:
struct cpusample {
uint64_t totalSystemTime;
uint64_t totalUserTime;
uint64_t totalIdleTime;
};
we alter the printCPUUsage call so that it performs a sample:
void sample(struct cpusample *sample)
{
processor_cpu_load_info_t cpuLoad;
mach_msg_type_number_t processorMsgCount;
natural_t processorCount;
uint64_t totalSystemTime = 0, totalUserTime = 0, totalIdleTime = 0;
kern_return_t err = host_processor_info(mach_host_self(), PROCESSOR_CPU_LOAD_INFO, &processorCount, (processor_info_array_t *)&cpuLoad, &processorMsgCount);
for (natural_t i = 0; i < processorCount; i++) {
// Calc load types and totals, with guards against 32-bit overflow
// (values are natural_t)
uint64_t system = 0, user = 0, idle = 0;
system = cpuLoad[i].cpu_ticks[CPU_STATE_SYSTEM];
user = cpuLoad[i].cpu_ticks[CPU_STATE_USER] + cpuLoad[i].cpu_ticks[CPU_STATE_NICE];
idle = cpuLoad[i].cpu_ticks[CPU_STATE_IDLE];
totalSystemTime += system;
totalUserTime += user;
totalIdleTime += idle;
}
sample->totalSystemTime = totalSystemTime;
sample->totalUserTime = totalUserTime;
sample->totalIdleTime = totalIdleTime;
}
Then we take two samples (1 second between samples):
struct cpusample delta;
sample(&sample1);
sleep(1);
sample(&sample2);
deltasample.totalSystemTime = sample2.totalSystemTime - sample1.totalSystemTime;
deltasample.totalUserTime = sample2.totalUserTime - sample1.totalUserTime;
deltasample.totalIdleTime = sample2.totalIdleTime - sample1.totalIdleTime;
The add a printsample code:
void printSample(struct cpusample *sample)
{
uint64_t total = sample->totalSystemTime + sample->totalUserTime + sample->totalIdleTime;
double onePercent = total/100.0f;
NSLog(#"system: %f", (double)sample->totalSystemTime/(double)onePercent);
NSLog(#"user: %f", (double)sample->totalUserTime/(double)onePercent);
NSLog(#"idle: %f", (double)sample->totalIdleTime/(double)onePercent);
}
so when you invoke printSample(&deltasample) it prints the delta record, which gives a much more similar value to the one presented by either top or Activity Monitor.
But to be honest, I'd use host_statistics, as the code is cleaner:
void sample(struct cpusample *sample)
{
kern_return_t kr;
mach_msg_type_number_t count;
host_cpu_load_info_data_t r_load;
uint64_t totalSystemTime = 0, totalUserTime = 0, totalIdleTime = 0;
count = HOST_CPU_LOAD_INFO_COUNT;
kr = host_statistics(mach_host_self(), HOST_CPU_LOAD_INFO, (int *)&r_load, &count);
if (kr != KERN_SUCCESS) {
printf("oops: %s\n", mach_error_string(kr));
return;
}
sample->totalSystemTime = r_load.cpu_ticks[CPU_STATE_SYSTEM];
sample->totalUserTime = r_load.cpu_ticks[CPU_STATE_USER] + r_load.cpu_ticks[CPU_STATE_NICE];
sample->totalIdleTime = r_load.cpu_ticks[CPU_STATE_IDLE];
}

QuadTree or KD Tree for objective c? [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking for code must demonstrate a minimal understanding of the problem being solved. Include attempted solutions, why they didn't work, and the expected results. See also: Stack Overflow question checklist
Closed 9 years ago.
Improve this question
I'm looking a while for a decent piece of code to use in my app, in one of those algorithms.
I found this example: http://rosettacode.org/wiki/K-d_tree#C
But when I put the code in xcode, I get an errors, for example:
"use of undeclared identifier", "expected ';' at the end of declaration".
I guess a header file is missing?
I copied the code from the link and made a minor edit which moved
"swap" from being an inline nested function to a static function.
Compiled with "gcc -C99 file.c" and it compiled ok. So, no, it doesn't
need some include file. Maybe you mis pasted it.
If you are happy with this answer, you could accept it. Thanks.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#define MAX_DIM 3
struct kd_node_t{
double x[MAX_DIM];
struct kd_node_t *left, *right;
};
inline double
dist(struct kd_node_t *a, struct kd_node_t *b, int dim)
{
double t, d = 0;
while (dim--) {
t = a->x[dim] - b->x[dim];
d += t * t;
}
return d;
}
static void swap(struct kd_node_t *x, struct kd_node_t *y) {
double tmp[MAX_DIM];
memcpy(tmp, x->x, sizeof(tmp));
memcpy(x->x, y->x, sizeof(tmp));
memcpy(y->x, tmp, sizeof(tmp));
}
/* see quickselect method */
struct kd_node_t*
find_median(struct kd_node_t *start, struct kd_node_t *end, int idx)
{
if (end <= start) return NULL;
if (end == start + 1)
return start;
struct kd_node_t *p, *store, *md = start + (end - start) / 2;
double pivot;
while (1) {
pivot = md->x[idx];
swap(md, end - 1);
for (store = p = start; p < end; p++) {
if (p->x[idx] < pivot) {
if (p != store)
swap(p, store);
store++;
}
}
swap(store, end - 1);
/* median has duplicate values */
if (store->x[idx] == md->x[idx])
return md;
if (store > md) end = store;
else start = store;
}
}
struct kd_node_t*
make_tree(struct kd_node_t *t, int len, int i, int dim)
{
struct kd_node_t *n;
if (!len) return 0;
if ((n = find_median(t, t + len, i))) {
i = (i + 1) % dim;
n->left = make_tree(t, n - t, i, dim);
n->right = make_tree(n + 1, t + len - (n + 1), i, dim);
}
return n;
}
/* global variable, so sue me */
int visited;
void nearest(struct kd_node_t *root, struct kd_node_t *nd, int i, int dim,
struct kd_node_t **best, double *best_dist)
{
double d, dx, dx2;
if (!root) return;
d = dist(root, nd, dim);
dx = root->x[i] - nd->x[i];
dx2 = dx * dx;
visited ++;
if (!*best || d < *best_dist) {
*best_dist = d;
*best = root;
}
/* if chance of exact match is high */
if (!*best_dist) return;
if (++i >= dim) i = 0;
nearest(dx > 0 ? root->left : root->right, nd, i, dim, best, best_dist);
if (dx2 >= *best_dist) return;
nearest(dx > 0 ? root->right : root->left, nd, i, dim, best, best_dist);
}
#define N 1000000
#define rand1() (rand() / (double)RAND_MAX)
#define rand_pt(v) { v.x[0] = rand1(); v.x[1] = rand1(); v.x[2] = rand1(); }
int main(void)
{
int i;
struct kd_node_t wp[] = {
{{2, 3}}, {{5, 4}}, {{9, 6}}, {{4, 7}}, {{8, 1}}, {{7, 2}}
};
struct kd_node_t this = {{9, 2}};
struct kd_node_t *root, *found, *million;
double best_dist;
root = make_tree(wp, sizeof(wp) / sizeof(wp[1]), 0, 2);
visited = 0;
found = 0;
nearest(root, &this, 0, 2, &found, &best_dist);
printf(">> WP tree\nsearching for (%g, %g)\n"
"found (%g, %g) dist %g\nseen %d nodes\n\n",
this.x[0], this.x[1],
found->x[0], found->x[1], sqrt(best_dist), visited);
million = calloc(N, sizeof(struct kd_node_t));
srand(time(0));
for (i = 0; i < N; i++) rand_pt(million[i]);
root = make_tree(million, N, 0, 3);
rand_pt(this);
visited = 0;
found = 0;
nearest(root, &this, 0, 3, &found, &best_dist);
printf(">> Million tree\nsearching for (%g, %g, %g)\n"
"found (%g, %g, %g) dist %g\nseen %d nodes\n",
this.x[0], this.x[1], this.x[2],
found->x[0], found->x[1], found->x[2],
sqrt(best_dist), visited);
/* search many random points in million tree to see average behavior.
tree size vs avg nodes visited:
10 ~ 7
100 ~ 16.5
1000 ~ 25.5
10000 ~ 32.8
100000 ~ 38.3
1000000 ~ 42.6
10000000 ~ 46.7 */
int sum = 0, test_runs = 100000;
for (i = 0; i < test_runs; i++) {
found = 0;
visited = 0;
rand_pt(this);
nearest(root, &this, 0, 3, &found, &best_dist);
sum += visited;
}
printf("\n>> Million tree\n"
"visited %d nodes for %d random findings (%f per lookup)\n",
sum, test_runs, sum/(double)test_runs);
// free(million);
return 0;
}