Binary search bit operation
I was wondering: which of these two implementations is faster, and why?
int N, A[N];

int binary_search(int val)
{
    int i, step;
    for (step = 1; step < N; step <<= 1)
        ;
    for (i = 0; step; step >>= 1)
        if (i + step < N && A[i + step] <= val)
            i += step;
}
versus a normal implementation, where you compute mid = (st + dr) / 2 and then continue on the left or the right half of the array, depending on how A[mid] compares with your value? A sketch of that style follows.
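For reference, here is a minimal sketch of the 'midpoint' style being described (the st/dr names are taken from the question's notation; this is an assumed reconstruction, not the asker's actual second implementation):

int binary_search_mid(int N, const int A[N], int val)
{
    int st = 0, dr = N - 1;              /* search window [st, dr] */
    while (st <= dr)
    {
        int mid = (st + dr) / 2;         /* or st + (dr - st) / 2 to avoid int overflow */
        if (A[mid] < val)
            st = mid + 1;                /* continue in the right half */
        else if (A[mid] > val)
            dr = mid - 1;                /* continue in the left half */
        else
            return mid;                  /* found */
    }
    return -1;                           /* absent */
}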
Conversion of pseudo-code to C code
The question asks about a code fragment which is not a working implementation of a binary search function because it does not handle the case where the sought-for value is absent from the array.
The code can be converted into a working C function like this:
int binary_search(int N, const int A[N], int val)
{
    int i, step;
    /* Find the smallest power of two that is >= N */
    for (step = 1; step < N; step <<= 1)
        ;
    /* March i forward in halving power-of-two steps, maintaining the
       invariant that A[i] <= val whenever i has been advanced */
    for (i = 0; step; step >>= 1)
        if (i + step < N && A[i + step] <= val)
            i += step;
    if (A[i] != val)    /* handle the absent-value case (assumes N >= 1) */
        i = -1;
    return i;
}
This has been tested in a rigorous test harness, and it produces answers equivalent to those of the other variations of binary search tested in the same harness.
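As a quick sanity check outside that harness, a minimal driver for the function above might look like this (the data values are invented for illustration):

#include <stdio.h>

int main(void)
{
    int data[] = { 2, 3, 5, 7, 11, 13 };         /* invented sorted sample data */
    int N = sizeof(data) / sizeof(data[0]);
    printf("%d\n", binary_search(N, data, 7));   /* prints 3 */
    printf("%d\n", binary_search(N, data, 8));   /* prints -1: absent */
    return 0;
}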
Comparison with alternative binary search algorithms
I happen to have 4 other implementations of binary search lurking in my folder of material related to SO questions. Given a program that can generate random numbers in a given range, and some supporting scripts, plus timing code that can report elapsed times to microseconds (it uses gettimeofday() on Mac OS X, much to the disgust of some, but that's good enough in this context), I generated this timing program. It includes algorithms:
BinSearch_A — find an arbitrary index P in array X[0..N-1] that matches T.
BinSearch_B — find the smallest index P in array X[0..N-1] that matches T.
BinSearch_C — find the largest index P in array X[0..N-1] that matches T.
BinSearch_D — find both smallest (L) and largest (U) indexes in array X[0..N-1] that match T.
BinSearch_E — find an arbitrary index P in array X[0..N-1] that matches T (using the algorithm in the question as amended above).
Note that algorithms B, C and D solve strictly harder problems than A and E do, so it is to be expected that B, C, and D will be slower than A and E.
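To make that distinction concrete: given X = {1, 2, 2, 2, 3} and T = 2, algorithms A and E may legitimately return any of the indexes 1, 2 or 3, whereas B must return 1, C must return 3, and D must return the pair (1, 3). A tiny driver (replacing the harness's main() and compiled together with the outline code below) would show this:

#include <stdio.h>

int main(void)
{
    int X[] = { 1, 2, 2, 2, 3 };
    int N = sizeof(X) / sizeof(X[0]);
    printf("A: %d\n", BinSearch_A(N, X, 2));   /* some index in 1..3 */
    printf("B: %d\n", BinSearch_B(N, X, 2));   /* 1: smallest match  */
    printf("C: %d\n", BinSearch_C(N, X, 2));   /* 3: largest match   */
    Pair p = BinSearch_D(N, X, 2);
    printf("D: %d..%d\n", p.lo, p.hi);         /* 1..3               */
    printf("E: %d\n", BinSearch_E(N, X, 2));   /* some index in 1..3 */
    return 0;
}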
Outline code (binsearch-speed-1.c)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct Pair
{
    int lo;
    int hi;
} Pair;
extern Pair BinSearch_D(int N, const int X[N], int T);
extern int BinSearch_A(int N, const int X[N], int T);
extern int BinSearch_B(int N, const int X[N], int T);
extern int BinSearch_C(int N, const int X[N], int T);
extern int BinSearch_E(int N, const int X[N], int T);
#ifndef lint
extern const char jlss_id_modbinsearch_c[];
const char jlss_id_modbinsearch_c[] = "@(#)$Id$";
#endif
int BinSearch_A(int N, const int X[N], int T)
{
    /* Classic midpoint binary search: returns an arbitrary matching index */
    int L = 0;
    int U = N - 1;
    while (1)
    {
        if (L > U)
            return -1;
        int M = (L + U) / 2;   /* indexes stay small here, so no overflow */
        if (X[M] < T)
            L = M + 1;
        else if (X[M] > T)
            U = M - 1;
        else
            return M;
    }
    assert(0);   /* not reached */
}
int BinSearch_B(int N, const int X[N], int T)
{
    /* Bisection for the smallest matching index: maintains X[L] < T <= X[U],
       with virtual sentinels at -1 and N */
    int L = -1;
    int U = N;
    while (L + 1 != U)
    {
        int M = (L + U) / 2;
        if (X[M] < T)
            L = M;
        else
            U = M;
    }
    assert(L+1 == U && (L == -1 || X[L] < T) && (U >= N || X[U] >= T));
    int P = U;
    if (P >= N || X[P] != T)
        P = -1;
    return P;
}
int BinSearch_C(int N, const int X[N], int T)
{
    /* Bisection for the largest matching index: maintains X[L] <= T < X[U] */
    int L = -1;
    int U = N;
    while (L + 1 != U)
    {
        int M = (L + U) / 2;
        if (X[M] <= T)
            L = M;
        else
            U = M;
    }
    assert(L+1 == U && (L == -1 || X[L] <= T) && (U >= N || X[U] > T));
    int P = L;
    if (P < 0 || X[P] != T)
        P = -1;
    return P;
}
Pair BinSearch_D(int N, const int X[N], int T)
{
    /* Runs the 'smallest match' and 'largest match' bisections in lock-step */
    int L_lo = -1;
    int L_hi = N;
    int U_lo = -1;
    int U_hi = N;
    while (L_lo + 1 != L_hi || U_lo + 1 != U_hi)
    {
        if (L_lo + 1 != L_hi)
        {
            int L_md = (L_lo + L_hi) / 2;
            if (X[L_md] < T)
                L_lo = L_md;
            else
                L_hi = L_md;
        }
        if (U_lo + 1 != U_hi)
        {
            int U_md = (U_lo + U_hi) / 2;
            if (X[U_md] <= T)
                U_lo = U_md;
            else
                U_hi = U_md;
        }
    }
    assert(L_lo+1 == L_hi && (L_lo == -1 || X[L_lo] < T) && (L_hi >= N || X[L_hi] >= T));
    int L = L_hi;
    if (L >= N || X[L] != T)
        L = -1;
    assert(U_lo+1 == U_hi && (U_lo == -1 || X[U_lo] <= T) && (U_hi >= N || X[U_hi] > T));
    int U = U_lo;
    if (U < 0 || X[U] != T)
        U = -1;
    return (Pair) { .lo = L, .hi = U };
}
int BinSearch_E(int N, const int X[N], int T)
{
    /* The question's 'shift' algorithm, as amended above */
    int i, step;
    for (step = 1; step < N; step <<= 1)
        ;
    for (i = 0; step; step >>= 1)
        if (i + step < N && X[i + step] <= T)
            i += step;
    if (X[i] != T)
        i = -1;
    return i;
}
#include "timer.h"
static const int numbers[] =
{
10000, 10002, 10003, 10003, 10003, 10004, 10006, 10010, 10011, 10015,
10016, 10020, 10023, 10024, 10029, 10029, 10030, 10031, 10032, 10035,
10036, 10036, 10037, 10037, 10038, 10041, 10043, 10044, 10046, 10049,
10066, 10066, 10069, 10070, 10071, 10074, 10079, 10080, 10085, 10086,
10087, 10089, 10090, 10090, 10090, 10091, 10092, 10094, 10095, 10095,
…990 similar lines omitted…
29869, 29870, 29872, 29872, 29874, 29877, 29877, 29882, 29884, 29888,
29895, 29898, 29899, 29908, 29912, 29922, 29923, 29924, 29925, 29929,
29934, 29936, 29938, 29939, 29941, 29942, 29943, 29943, 29944, 29945,
29947, 29949, 29951, 29953, 29956, 29958, 29959, 29959, 29964, 29965,
29965, 29966, 29968, 29969, 29981, 29983, 29983, 29984, 29984, 29988,
};
enum { NUM_NUMBERS = sizeof(numbers) / sizeof(numbers[0]) };
static void check_sorted(const char *a_name, int size, const int array[size])
{
    int ok = 1;
    for (int i = 1; i < size; i++)
    {
        if (array[i-1] > array[i])
        {
            fprintf(stderr, "Out of order: %s[%d] = %d, %s[%d] = %d\n",
                    a_name, i-1, array[i-1], a_name, i, array[i]);
            ok = 0;
        }
    }
    if (!ok)
        exit(1);
}
static int BinSearch_D1(int size, const int array[size], int value)
{
    /* Adapter so BinSearch_D fits the common BinSearch signature */
    Pair p = BinSearch_D(size, array, value);
    return p.lo;
}

typedef int (*BinSearch)(int size, const int data[size], int value);

static void time_search(const char *a_name, int size, const int array[size],
                        BinSearch function)
{
    Clock clk;
    clk_init(&clk);
    int x0 = array[0] - 1;       /* one below the smallest value */
    int x1 = array[size-1] + 2;  /* one above the largest value */
    long long vsum = 0;
    clk_start(&clk);
    for (int i = x0; i < x1; i++)
    {
        int index = (*function)(size, array, i);
        vsum += (index == -1) ? index : array[index];  /* keeps the optimizer honest */
    }
    clk_stop(&clk);
    char buffer[32];
    printf("%s: (%d) %lld %s\n", a_name, size, vsum,
           clk_elapsed_us(&clk, buffer, sizeof(buffer)));
}
int main(void)
{
    check_sorted("numbers", NUM_NUMBERS, numbers);
    for (int i = 0; i < 10; i++)
    {
        time_search("BinSearch_A", NUM_NUMBERS, numbers, BinSearch_A);
        time_search("BinSearch_B", NUM_NUMBERS, numbers, BinSearch_B);
        time_search("BinSearch_C", NUM_NUMBERS, numbers, BinSearch_C);
        time_search("BinSearch_D", NUM_NUMBERS, numbers, BinSearch_D1);
        time_search("BinSearch_E", NUM_NUMBERS, numbers, BinSearch_E);
    }
    return 0;
}
This code works with an array of 10,000 random numbers, with repeats, in the range 10,000 to 29,999. This means that approximately half the possible values in the range are present in the array. For each function, it computes the index of each value in the range from the smallest number in the array - 1 to the largest number in the array + 1. Because the algorithms legitimately return different indexes when there are multiple matches possible, the test code sums the found values (and subtracts 1 for each search failure). The output identifies the time taken in microseconds, and prints the array size and the computed value. One reason for the computation is to ensure the optimizer doesn't optimize too much.
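A common alternative device for keeping the optimizer honest (not what the harness above uses) is a volatile sink; the store cannot be elided, so the searches must actually execute:

/* Hypothetical alternative to summing-and-printing: a volatile sink */
static volatile long long sink;

static void consume(long long vsum)
{
    sink = vsum;   /* the compiler must perform this store */
}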
I also generated a second program (binsearch-speed-2.c) with the same code but with 1,000,000 numbers in the range 1,000,000 to 3,000,000. Since the times for binsearch-speed-1.c were in the range 0.7 to 1.4 milliseconds, which is less run time than I regard as comfortable for reliable measurement, I increased the problem size by a factor of 100 to generate correspondingly bigger times. However, the bigger problem changed the relative timings of the algorithms (which is the reason you get to see this).
The tests were run on an oldish MacBook Pro (Early 2011) with a 2.3 GHz Intel Core i7 CPU and 16 GiB of 1333 MHz DDR3 memory, running Mac OS X 10.11.4, and using GCC 5.3.0. Your mileage will vary!
Sample compilation command line:
$ gcc -O3 -g -I$HOME/inc -std=c11 -Wall -Wextra -Wmissing-prototypes -Wstrict-prototypes \
> -Wold-style-definition -Werror binsearch-speed-2.c -o binsearch-speed-2 \
> -L$HOME/lib/64 -ljl
$
The timing functions are in the library referenced.
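For readers without that library, a minimal stand-in for the Clock interface might look like the sketch below. This is an assumption based only on the clk_* calls the harness makes; it substitutes POSIX clock_gettime() for gettimeofday(), so it needs a reasonably modern system:

/* Hypothetical stand-in for timer.h: implements only the calls the
   harness uses, with POSIX clock_gettime() in place of gettimeofday(). */
#include <stdio.h>
#include <time.h>

typedef struct Clock
{
    struct timespec start;
    struct timespec stop;
} Clock;

static void clk_init(Clock *clk)  { clk->start.tv_sec = 0; clk->start.tv_nsec = 0; clk->stop = clk->start; }
static void clk_start(Clock *clk) { clock_gettime(CLOCK_MONOTONIC, &clk->start); }
static void clk_stop(Clock *clk)  { clock_gettime(CLOCK_MONOTONIC, &clk->stop); }

/* Format the elapsed time as seconds.microseconds, as the output shows */
static char *clk_elapsed_us(Clock *clk, char *buffer, size_t buflen)
{
    long sec  = clk->stop.tv_sec  - clk->start.tv_sec;
    long nsec = clk->stop.tv_nsec - clk->start.tv_nsec;
    if (nsec < 0)
    {
        sec -= 1;
        nsec += 1000000000L;
    }
    snprintf(buffer, buflen, "%ld.%06ld", sec, nsec / 1000);
    return buffer;
}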
Raw Results binsearch-speed-1 (Size 10,000)
BinSearch_A: (10000) 158341368 0.000817
BinSearch_B: (10000) 158341368 0.001076
BinSearch_C: (10000) 158341368 0.001006
BinSearch_D: (10000) 158341368 0.001337
BinSearch_E: (10000) 158341368 0.000787
BinSearch_A: (10000) 158341368 0.000771
BinSearch_B: (10000) 158341368 0.001540
BinSearch_C: (10000) 158341368 0.001003
BinSearch_D: (10000) 158341368 0.001344
BinSearch_E: (10000) 158341368 0.000791
BinSearch_A: (10000) 158341368 0.000799
BinSearch_B: (10000) 158341368 0.001078
BinSearch_C: (10000) 158341368 0.001008
BinSearch_D: (10000) 158341368 0.001386
BinSearch_E: (10000) 158341368 0.000802
BinSearch_A: (10000) 158341368 0.000774
BinSearch_B: (10000) 158341368 0.001083
BinSearch_C: (10000) 158341368 0.001176
BinSearch_D: (10000) 158341368 0.001495
BinSearch_E: (10000) 158341368 0.000907
BinSearch_A: (10000) 158341368 0.000817
BinSearch_B: (10000) 158341368 0.001080
BinSearch_C: (10000) 158341368 0.001007
BinSearch_D: (10000) 158341368 0.001357
BinSearch_E: (10000) 158341368 0.000786
BinSearch_A: (10000) 158341368 0.000756
BinSearch_B: (10000) 158341368 0.001080
BinSearch_C: (10000) 158341368 0.001899
BinSearch_D: (10000) 158341368 0.001644
BinSearch_E: (10000) 158341368 0.000791
BinSearch_A: (10000) 158341368 0.000770
BinSearch_B: (10000) 158341368 0.001087
BinSearch_C: (10000) 158341368 0.001014
BinSearch_D: (10000) 158341368 0.001378
BinSearch_E: (10000) 158341368 0.000793
BinSearch_A: (10000) 158341368 0.001415
BinSearch_B: (10000) 158341368 0.001160
BinSearch_C: (10000) 158341368 0.001006
BinSearch_D: (10000) 158341368 0.001336
BinSearch_E: (10000) 158341368 0.000786
BinSearch_A: (10000) 158341368 0.000763
BinSearch_B: (10000) 158341368 0.001079
BinSearch_C: (10000) 158341368 0.001012
BinSearch_D: (10000) 158341368 0.001309
BinSearch_E: (10000) 158341368 0.000796
BinSearch_A: (10000) 158341368 0.000769
BinSearch_B: (10000) 158341368 0.001094
BinSearch_C: (10000) 158341368 0.001029
BinSearch_D: (10000) 158341368 0.001397
BinSearch_E: (10000) 158341368 0.000800
Raw Results binsearch-speed-2 (Size 1,000,000)
BinSearch_A: (1000000) 1573140220897 0.081161
BinSearch_B: (1000000) 1573140220897 0.137057
BinSearch_C: (1000000) 1573140220897 0.132743
BinSearch_D: (1000000) 1573140220897 0.166290
BinSearch_E: (1000000) 1573140220897 0.189696
BinSearch_A: (1000000) 1573140220897 0.083374
BinSearch_B: (1000000) 1573140220897 0.136225
BinSearch_C: (1000000) 1573140220897 0.128654
BinSearch_D: (1000000) 1573140220897 0.168078
BinSearch_E: (1000000) 1573140220897 0.190977
BinSearch_A: (1000000) 1573140220897 0.083391
BinSearch_B: (1000000) 1573140220897 0.135630
BinSearch_C: (1000000) 1573140220897 0.131179
BinSearch_D: (1000000) 1573140220897 0.168578
BinSearch_E: (1000000) 1573140220897 0.188785
BinSearch_A: (1000000) 1573140220897 0.083069
BinSearch_B: (1000000) 1573140220897 0.135803
BinSearch_C: (1000000) 1573140220897 0.136248
BinSearch_D: (1000000) 1573140220897 0.170167
BinSearch_E: (1000000) 1573140220897 0.188973
BinSearch_A: (1000000) 1573140220897 0.084509
BinSearch_B: (1000000) 1573140220897 0.145219
BinSearch_C: (1000000) 1573140220897 0.129374
BinSearch_D: (1000000) 1573140220897 0.168213
BinSearch_E: (1000000) 1573140220897 0.186770
BinSearch_A: (1000000) 1573140220897 0.086911
BinSearch_B: (1000000) 1573140220897 0.141995
BinSearch_C: (1000000) 1573140220897 0.134353
BinSearch_D: (1000000) 1573140220897 0.169639
BinSearch_E: (1000000) 1573140220897 0.194442
BinSearch_A: (1000000) 1573140220897 0.082882
BinSearch_B: (1000000) 1573140220897 0.135095
BinSearch_C: (1000000) 1573140220897 0.129635
BinSearch_D: (1000000) 1573140220897 0.166059
BinSearch_E: (1000000) 1573140220897 0.186700
BinSearch_A: (1000000) 1573140220897 0.083190
BinSearch_B: (1000000) 1573140220897 0.134491
BinSearch_C: (1000000) 1573140220897 0.130103
BinSearch_D: (1000000) 1573140220897 0.169454
BinSearch_E: (1000000) 1573140220897 0.188583
BinSearch_A: (1000000) 1573140220897 0.083038
BinSearch_B: (1000000) 1573140220897 0.135738
BinSearch_C: (1000000) 1573140220897 0.129727
BinSearch_D: (1000000) 1573140220897 0.169101
BinSearch_E: (1000000) 1573140220897 0.188749
BinSearch_A: (1000000) 1573140220897 0.082099
BinSearch_B: (1000000) 1573140220897 0.135025
BinSearch_C: (1000000) 1573140220897 0.130743
BinSearch_D: (1000000) 1573140220897 0.168684
BinSearch_E: (1000000) 1573140220897 0.188640
Statistics on raw data
Program             Algorithm     Tests   Avg Time    Std Dev
binsearch-speed-1   BinSearch_A    10     0.0008451   0.0002014
                    BinSearch_B    10     0.0011357   0.0001442
                    BinSearch_C    10     0.0011160   0.0002801
                    BinSearch_D    10     0.0013983   0.0001003
                    BinSearch_E    10     0.0008039   0.0000366
binsearch-speed-2   BinSearch_A    10     0.0833624   0.0015203
                    BinSearch_B    10     0.1372278   0.0035168
                    BinSearch_C    10     0.1312759   0.0024403
                    BinSearch_D    10     0.1684263   0.0013514
                    BinSearch_E    10     0.1892315   0.0022148
Provisional conclusions
When the problem size was ten thousand, then the 'shift' algorithm (BinSearch_E) seemed to perform a little better than the simple 'midpoint' algorithm (BinSearch_A), but the difference was not obviously significant — I've not run a T-Test or similar on the data.
When the problem size was one million, the 'midpoint' algorithm beat the 'shift' algorithm by a significant margin; indeed, the 'shift' algorithm even performed worse than the three more complex algorithms.
This was unexpected, especially the 'shift' algorithm coming in behind the three more complex algorithms; I expected the two simple algorithms to perform essentially the same, and I don't have a good explanation for why they don't. This sort of result shows why benchmarking is hard. If I had to guess, I'd suspect that, despite the bigger array needing only about 4 MiB of memory, the 'shift' algorithm's access pattern over the elements makes the caches less effective. Proving (or disproving) that would be hard; it would require a better performance-testing engineer than I am.
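One way to probe that guess is to log the sequence of array indexes each algorithm touches and compare the two access patterns; a small standalone sketch (the sorted array is implicit, with X[i] == i, so comparing an index against the target index stands in for comparing element values):

/* Sketch: print the index sequences the two algorithms would probe */
#include <stdio.h>

static void probes_midpoint(int N, int target)
{
    int L = 0, U = N - 1;
    while (L <= U)
    {
        int M = (L + U) / 2;
        printf("%d ", M);              /* index the midpoint search reads */
        if (M < target)
            L = M + 1;
        else if (M > target)
            U = M - 1;
        else
            break;
    }
    putchar('\n');
}

static void probes_shift(int N, int target)
{
    int i, step;
    for (step = 1; step < N; step <<= 1)
        ;
    for (i = 0; step; step >>= 1)
    {
        if (i + step < N)
        {
            printf("%d ", i + step);   /* index the shift search reads */
            if (i + step <= target)
                i += step;
        }
    }
    putchar('\n');
}

int main(void)
{
    probes_midpoint(1000000, 654321);
    probes_shift(1000000, 654321);
    return 0;
}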
Given the consistent performance of Algorithm A (simple midpoint), which scaled essentially linearly for the tests performed, I would use that rather than Algorithm E — unless I could demonstrate that Algorithm E (shift) gave better performance on the size of data sets that the code was working with.
And this shows why benchmarking matters whenever the utmost in performance is needed.