I have a quadratic optimization problem with linear constraints that I want to solve using SCIP. The matrix in the objective that I want to minimize is positive semi-definite (it is the variance of certain variables, to be precise). I have the problem in a file in CPLEX LP format, and when I optimize in SCIP, I get the message
Quadratic constraint handler does not have LAPACK for eigenvalue computation. Will assume
that matrices (with size > 2x2) are indefinite.
So SCIP starts optimization assuming that the matrix is indefinite, and it takes a large amount of time. I have installed LAPACK, even copied the liblapack.a file into the lib folder where the SCIP source and binaries are, and reinstalled SCIP. But I keep getting the above message.
Is there a way to make SCIP use the LAPACK library? I believe the optimization will be much faster if SCIP can figure out that the matrix is positive semi-definite.
If you feel like patching SCIP a bit to use your LAPACK library without providing a full Ipopt build (though Ipopt is relatively easy to build on *nix and could help performance, as mattmilten pointed out), here is a patch that you could try out:
diff --git a/src/scip/cons_quadratic.c b/src/scip/cons_quadratic.c
index 93ba359..795bade 100644
--- a/src/scip/cons_quadratic.c
+++ b/src/scip/cons_quadratic.c
@@ -46,7 +46,7 @@
#include "scip/heur_trysol.h"
#include "scip/debug.h"
#include "nlpi/nlpi.h"
-#include "nlpi/nlpi_ipopt.h"
+/*#include "nlpi/nlpi_ipopt.h" */
/* constraint handler properties */
#define CONSHDLR_NAME "quadratic"
@@ -4257,6 +4257,71 @@ void checkCurvatureEasy(
*determined = FALSE;
}
+#define F77_FUNC(a,A) a##_
+
+ /** LAPACK Fortran subroutine DSYEV */
+ void F77_FUNC(dsyev,DSYEV)(
+ char* jobz, /**< 'N' to compute eigenvalues only, 'V' to compute eigenvalues and eigenvectors */
+ char* uplo, /**< 'U' if upper triangle of A is stored, 'L' if lower triangle of A is stored */
+ int* n, /**< dimension */
+ double* A, /**< matrix A on entry; orthonormal eigenvectors on exit, if jobz == 'V' and info == 0; if jobz == 'N', then the matrix data is destroyed */
+ int* ldA, /**< leading dimension, probably equal to n */
+ double* W, /**< buffer for the eigenvalues in ascending order */
+ double* WORK, /**< workspace array */
+ int* LWORK, /**< length of WORK; if LWORK = -1, then the optimal workspace size is calculated and returned in WORK(1) */
+ int* info /**< == 0: successful exit; < 0: illegal argument at given position; > 0: failed to converge */
+ );
+
+/** Calls LAPACK's DSYEV routine to compute eigenvalues and eigenvectors of a dense matrix.
+ */
+static
+SCIP_RETCODE LapackDsyev(
+ SCIP_Bool computeeigenvectors,/**< should eigenvectors also be computed? */
+ int N, /**< dimension */
+ SCIP_Real* a, /**< matrix data on input (size N*N); eigenvectors on output if computeeigenvectors == TRUE */
+ SCIP_Real* w /**< buffer to store eigenvalues (size N) */
+ )
+{
+ int INFO;
+ char JOBZ = computeeigenvectors ? 'V' : 'N';
+ char UPLO = 'L';
+ int LDA = N;
+ double* WORK = NULL;
+ int LWORK;
+ double WORK_PROBE;
+ int i;
+
+ /* First we find out how large LWORK should be */
+ LWORK = -1;
+ F77_FUNC(dsyev,DSYEV)(&JOBZ, &UPLO, &N, a, &LDA, w, &WORK_PROBE, &LWORK, &INFO);
+ if( INFO != 0 )
+ {
+ SCIPerrorMessage("There was an error when calling DSYEV. INFO = %d\n", INFO);
+ return SCIP_ERROR;
+ }
+
+ LWORK = (int) WORK_PROBE;
+ assert(LWORK > 0);
+
+ SCIP_ALLOC( BMSallocMemoryArray(&WORK, LWORK) );
+
+ for( i = 0; i < LWORK; ++i )
+ WORK[i] = i;
+
+ F77_FUNC(dsyev,DSYEV)(&JOBZ, &UPLO, &N, a, &LDA, w, WORK, &LWORK, &INFO);
+
+ BMSfreeMemoryArray(&WORK);
+
+ if( INFO != 0 )
+ {
+ SCIPerrorMessage("There was an error when calling DSYEV. INFO = %d\n", INFO);
+ return SCIP_ERROR;
+ }
+
+ return SCIP_OKAY;
+}
+
+
/** checks a quadratic constraint for convexity and/or concavity */
static
SCIP_RETCODE checkCurvature(
@@ -4343,7 +4408,7 @@ SCIP_RETCODE checkCurvature(
return SCIP_OKAY;
}
- if( SCIPisIpoptAvailableIpopt() )
+ if( TRUE )
{
for( i = 0; i < consdata->nbilinterms; ++i )
{
@@ -4479,7 +4544,7 @@ SCIP_RETCODE checkFactorable(
return SCIP_OKAY;
/* need routine to compute eigenvalues/eigenvectors */
- if( !SCIPisIpoptAvailableIpopt() )
+ if( !TRUE )
return SCIP_OKAY;
SCIP_CALL( consdataSortQuadVarTerms(scip, consdata) );
@@ -9395,7 +9460,7 @@ SCIP_DECL_CONSINITSOL(consInitsolQuadratic)
SCIP_CALL( SCIPcatchEvent(scip, SCIP_EVENTTYPE_SOLFOUND, eventhdlr, (SCIP_EVENTDATA*)conshdlr, &conshdlrdata->newsoleventfilterpos) );
}
- if( nconss != 0 && !SCIPisIpoptAvailableIpopt() && !SCIPisInRestart(scip) )
+ if( nconss != 0 && !TRUE && !SCIPisInRestart(scip) )
{
SCIPverbMessage(scip, SCIP_VERBLEVEL_HIGH, NULL, "Quadratic constraint handler does not have LAPACK for eigenvalue computation. Will assume that matrices (with size > 2x2) are indefinite.\n");
}
Use USRLDFLAGS="-llapack -lblas" with make.
Currently, SCIP is only able to use LAPACK through Ipopt. Performance on nonlinear problems is usually better when SCIP is compiled with Ipopt support, so this is definitely recommended. Run
make IPOPT=true
and make sure you have Ipopt installed beforehand.
I'm currently trying to write a kernel in OpenCL with the goal of sum-reducing each row of a matrix (g_idata) into an array (g_odata). The matrix is represented by a float array of length column_count * row_count, and the resulting array has length row_count. I've implemented the following kernel:
#define T float
#define Operation(X, Y) ((X) + (Y))
__kernel void marrow_kernel( __global T *g_odata,__global T *g_idata,
const unsigned long column_count, const unsigned long row_count, __local volatile T* sdata) {
size_t tid = get_local_id(0);
size_t gid = get_global_id(0);
size_t row = gid / column_count;
size_t column = gid % column_count;
if(row < row_count && column < column_count)
{
sdata[tid] = g_idata[gid];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(row < row_count && column < column_count)
{
size_t step = column_count / 2;
size_t limit = column_count;
while(step > 0)
{
if(column + step < limit) {
if(tid + step < get_local_size(0))
{
sdata[tid] = Operation(sdata[tid], sdata[tid + step]);
}
else if (gid + step < column_count * row_count)
{
sdata[tid] = Operation(sdata[tid], g_idata[gid + step]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
step /= 2;
limit /= 2;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(row < row_count && column == 0)
{
g_odata[row] = column_count % 2 == 0 ? sdata[tid] : sdata[tid] + g_idata[gid + (column_count - 1)];
}
}
The kernel is currently launched with a work-group size of 128 work-items. I currently have no control over the size of the work-group.
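For context, the host side launches the kernel roughly like the sketch below (launch_marrow_kernel, d_odata and d_idata are placeholder names, not the real ones); the __local sdata argument is sized by passing NULL as the argument value to clSetKernelArg:
#include <CL/cl.h>

/* Hypothetical host-side launch for the kernel above. */
static cl_int launch_marrow_kernel(cl_command_queue queue, cl_kernel kernel,
                                   cl_mem d_odata, cl_mem d_idata,
                                   cl_ulong column_count, cl_ulong row_count)
{
    size_t local_size  = 128;                                  /* fixed work-group size */
    size_t total       = (size_t)(column_count * row_count);
    size_t global_size = ((total + local_size - 1) / local_size) * local_size;

    clSetKernelArg(kernel, 0, sizeof(cl_mem),   &d_odata);
    clSetKernelArg(kernel, 1, sizeof(cl_mem),   &d_idata);
    clSetKernelArg(kernel, 2, sizeof(cl_ulong), &column_count);
    clSetKernelArg(kernel, 3, sizeof(cl_ulong), &row_count);
    /* one float of local memory per work-item backs the sdata argument */
    clSetKernelArg(kernel, 4, local_size * sizeof(cl_float), NULL);

    return clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                  &global_size, &local_size, 0, NULL, NULL);
}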
Now here's the issue: if a row is shared between two different work-groups, the kernel returns the wrong result, because it falls back to fetching the value from g_idata, since it's impossible to access the other work-group's local memory. After the first iteration that value is stale, and it affects the final result of the operation.
Can anyone give me a hint on how to solve this problem?
Find the nth int with 10 set bits
n is an int in the range 0 <= n <= 30 045 014
The 0th int = 1023, the 1st = 1535, and so on
snob() ("same number of bits") returns the lowest integer bigger than n with the same number of set bits as n:
int snob(int n) {
int a=n&-n, b=a+n;
return b|(n^b)/a>>2;
}
Calling snob() n times will work:
int nth(int n){
int o =1023;
for(int i=0;i<n;i++)o=snob(o);
return o;
}
example
https://ideone.com/ikGNo7
Is there some way to find it faster?
I found one pattern, but I'm not sure if it's useful.
Using factorials you can find the "indexes" where all 10 set bits are consecutive:
1023 << x is the ((x+10)! / (x! * 10!) - 1)-th integer
1023<<1 is the 10th
1023<<2 is the 65th
1023<<3 is the 285th
...
Btw I'm not a student and this is not homework.
EDIT:
Found an alternative to snob()
https://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation
int lnbp(int v){
int t = (v | (v - 1)) + 1;
return t | ((((t & -t) / (v & -v)) >> 1) - 1);
}
I have built an implementation that should satisfy your needs.
/** A lookup table to see how many combinations preceded this one */
private static int[][] LOOKUP_TABLE_COMBINATION_POS;
/** The number of possible combinations with i bits */
private static int[] NBR_COMBINATIONS;
static {
LOOKUP_TABLE_COMBINATION_POS = new int[Integer.SIZE][Integer.SIZE];
for (int bit = 0; bit < Integer.SIZE; bit++) {
// Ignore less significant bits, compute how many combinations have to be
// visited to set this bit, i.e.
// (bit = 4, pos = 5), before came 0b1XXX and 0b1XXXX, that's C(3, 3) + C(4, 3)
int nbrBefore = 0;
// The nth-bit can be only encountered after pos n
for (int pos = bit; pos < Integer.SIZE; pos++) {
LOOKUP_TABLE_COMBINATION_POS[bit][pos] = nbrBefore;
nbrBefore += nChooseK(pos, bit);
}
}
NBR_COMBINATIONS = new int[Integer.SIZE + 1];
for (int bits = 0; bits < NBR_COMBINATIONS.length; bits++) {
NBR_COMBINATIONS[bits] = nChooseK(Integer.SIZE, bits);
assert NBR_COMBINATIONS[bits] > 0; // Important for modulo check. Otherwise we must use unsigned arithmetic
}
}
private static int nChooseK(int n, int k) {
assert k >= 0 && k <= n;
if (k > n / 2) {
k = n - k;
}
long nCk = 1; // (N choose 0)
for (int i = 0; i < k; i++) {
// (N choose K+1) = (N choose K) * (n-k) / (k+1);
nCk *= (n - i);
nCk /= (i + 1);
}
return (int) nCk;
}
public static int nextCombination(int w, int n) {
// TODO: maybe for small n just advance naively
// Get the position of the current pattern w
int nbrBits = 0;
int position = 0;
while (w != 0) {
final int currentBit = Integer.lowestOneBit(w); // w & -w;
final int bitPos = Integer.numberOfTrailingZeros(currentBit);
position += LOOKUP_TABLE_COMBINATION_POS[nbrBits][bitPos];
// toggle off bit
w ^= currentBit;
nbrBits++;
}
position += n;
// Wrapping, optional
position %= NBR_COMBINATIONS[nbrBits];
// And reverse lookup
int v = 0;
int m = Integer.SIZE - 1;
while (nbrBits-- > 0) {
final int[] bitPositions = LOOKUP_TABLE_COMBINATION_POS[nbrBits];
// Search for largest bitPos such that position >= bitPositions[bitPos]
while (Integer.compareUnsigned(position, bitPositions[m]) < 0)
m--;
position -= bitPositions[m];
v ^= (0b1 << m--);
}
return v;
}
Now for some explanation. LOOKUP_TABLE_COMBINATION_POS[bit][pos] is the core of the algorithm and what makes it as fast as it is. The table is designed so that a bit pattern with k bits at positions p_0 < p_1 < ... < p_{k - 1} has a position of sum_{i = 0}^{k - 1} LOOKUP_TABLE_COMBINATION_POS[i][p_i].
The intuition is that we try to move the bits back one by one until we reach the pattern where all bits are at the lowest possible positions. Moving the i-th bit from position k + 1 to k moves it back by C(k-1, i-1) positions, provided that all lower bits are at the right-most positions (no moving bits into or through each other), since we skip over all possible combinations with the i-1 bits in k-1 slots.
We can thus "decode" a bit pattern to a position, keeping track of the bits encountered. We then advance by n positions (rolling over in case we enumerated all possible positions for k bits) and encode this position again.
To encode a pattern, we reverse the process. For this, we move bits forward from their starting positions, as long as the position is smaller than what we're aiming for. Instead of a linear search through LOOKUP_TABLE_COMBINATION_POS we could employ a binary search for our target index m, but it's hardly needed; the size of an int is not big. Nevertheless, we reuse our invariant that a smaller bit must also come at a less significant position, so that our algorithm is effectively O(n) where n = Integer.SIZE.
I'll leave you with the following assertions to demonstrate the resulting algorithm:
nextCombination(0b1111111111, 1) == 0b10111111111;
nextCombination(0b1111111111, 10) == 0b11111111110;
nextCombination(0x00FF , 4) == 0x01EF;
nextCombination(0x7FFFFFFF , 4) == 0xF7FFFFFF;
nextCombination(0x03FF , 10) == 0x07FE;
// Correct wrapping
nextCombination(0b1 , 32) == 0b1;
nextCombination(0x7FFFFFFF , 32) == 0x7FFFFFFF;
nextCombination(0xFFFFFFEF , 5) == 0x7FFFFFFF;
Let us consider the numbers with k=10 bits set.
The trick is to determine, for a given n, the position of the most significant set bit.
There is a single number of length k: C(k, k) = 1. There are C(k+1, k) = k + 1 numbers of length at most k + 1. ... There are C(m, k) numbers of length at most m.
For k = 10, the limits on n are the cumulative sums 1 + 10 + 55 + 220 + 715 + 2002 + 5005 + 11440 + ...
For a given n, you easily find the corresponding m. Then the problem reduces to finding the (n - C(m, k))-th number with k - 1 bits set, and so on recursively.
With precomputed tables, this can be very fast. 30045015 takes 30 lookups, so I guess that the worst case is 29 x 30 / 2 = 435 lookups.
(This is based on linear lookups, to favor small values. By means of dichotomic search, you reduce this to less than 29 x lg(30) = 145 lookups at worst.)
Update:
My previous estimates were pessimistic. Indeed, as we are looking for k bits, there are only 10 determinations of m. In the linear case, at worst 245 lookups; in the dichotomic case, less than 50.
(I don't exclude off-by-one errors in the estimates, but clearly this method is very efficient and requires no snob.)
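In case it helps, here is a minimal C sketch of this scheme (nth_with_k_bits, binom and the hard-coded BITS = 10 are my own names, not from any library): precompute the binomial coefficients, then pick the most significant bit greedily and recurse on the remainder.
#include <stdio.h>

#define BITS   10
#define MAXLEN 31   /* bit positions 0..30 are enough for n <= 30 045 014 */

static unsigned long binom[MAXLEN + 1][BITS + 1];   /* binom[m][j] = C(m, j) */

static void init_binomials(void)
{
    for (int m = 0; m <= MAXLEN; m++) {
        binom[m][0] = 1;
        for (int j = 1; j <= BITS && j <= m; j++)
            binom[m][j] = binom[m - 1][j - 1] + (j <= m - 1 ? binom[m - 1][j] : 0);
    }
}

/* n-th (0-indexed) integer with exactly BITS set bits: choose the most
 * significant bit position greedily, subtract the combinations skipped,
 * then repeat for the remaining bits. */
static unsigned nth_with_k_bits(unsigned long n)
{
    unsigned result = 0;
    for (int j = BITS; j >= 1; j--) {
        int pos = j - 1;                      /* lowest legal position for this bit */
        while (pos + 1 <= MAXLEN && binom[pos + 1][j] <= n)
            pos++;                            /* linear scan; dichotomic search also works */
        n -= binom[pos][j];
        result |= 1u << pos;
    }
    return result;
}

int main(void)
{
    init_binomials();
    printf("%u %u %u\n", nth_with_k_bits(0),  /* 1023 */
           nth_with_k_bits(1),                /* 1535 */
           nth_with_k_bits(65));              /* 4092 = 1023 << 2 */
    return 0;
}
With this, nth_with_k_bits(0), nth_with_k_bits(1) and nth_with_k_bits(65) come out as 1023, 1535 and 1023 << 2, matching the values given in the question.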
Is there any way to accelerate this simple kernel function? I have thought about using shared memory, but N is equal to 507904, so it is much larger than a shared-memory array could be.
My program creates blocks of 256 threads each.
__global__ void compute(COMPLEX_TYPE *a, COMPLEX_TYPE *b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
F[i] = ( a[i].x*a[i].x + a[i].y*a[i].y + b[i].x*b[i].x + b[i].y*b[i].y) / (f);
}
}
The simplest general optimisation would be something like this:
__global__ void compute(const COMPLEX_TYPE * __restrict__ a,
const COMPLEX_TYPE * __restrict__ b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
#pragma unroll 8
for(; i < N; i += blockDim.x * gridDim.x)
{
COMPLEX_TYPE aval = a[i], bval = b[i];
FLOAT_TYPE Fval;
Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) / (f);
F[i] = Fval;
}
}
[disclaimer: written in browser, not tested, use at own risk]
The idea here is to launch only as many threads as will execute concurrently on your target GPU, and then have every thread perform multiple operations rather than one. This helps amortise a lot of the fixed overhead at the block-scheduler and setup-code level and improves the overall efficiency. On most architectures, this will probably be memory-bandwidth limited anyway, so memory coalescing and transaction optimisation is about the most important performance optimisation you will be able to make.
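As a rough sketch of how one might size such a launch (grid_size_for and the 8-blocks-per-SM figure below are assumptions of mine, not measured values; the CUDA occupancy API can give an exact figure):
#include <cuda_runtime_api.h>

/* Cap the grid at roughly what the device can keep resident, so each
 * thread of the grid-strided loop processes several elements. */
static int grid_size_for(int n, int threads_per_block)
{
    struct cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int needed   = (n + threads_per_block - 1) / threads_per_block;
    int resident = prop.multiProcessorCount * 8;   /* assumed ~8 resident blocks per SM */
    return needed < resident ? needed : resident;
}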
EDIT: Since this answer was marked CW, I elected to add my tests here, rather than create my own answer. If anyone objects to this, please just roll back the edit to a previous acceptable version. I'm not adding any new ideas, just testing those provided by @talonmies and @JanLucas.
In my test case, the suggestions (excepting the unroll pragma) offered by @talonmies seem to give rise to a ~10% perf improvement. The suggestion by @JanLucas, to replace the floating-point divide with a floating-point multiply, if acceptable, seems to give about a doubling of performance. This will obviously vary depending on GPU and other specifics. Here's my test:
$ cat t891.cu
#include <cuComplex.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 507904
#define nTPB 256
#define nBLK 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
typedef cuFloatComplex COMPLEX_TYPE;
typedef float FLOAT_TYPE;
__global__ void compute(COMPLEX_TYPE *a, COMPLEX_TYPE *b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
F[i] = ( a[i].x*a[i].x + a[i].y*a[i].y + b[i].x*b[i].x + b[i].y*b[i].y) / (f);
}
}
__global__ void compute_imp(const COMPLEX_TYPE * __restrict__ a,
const COMPLEX_TYPE * __restrict__ b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
// #pragma unroll 8
for(; i < N; i += blockDim.x * gridDim.x)
{
COMPLEX_TYPE aval = a[i];
COMPLEX_TYPE bval = b[i];
FLOAT_TYPE Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) / (f);
F[i] = Fval;
}
}
__global__ void compute_imp2(const COMPLEX_TYPE * __restrict__ a,
const COMPLEX_TYPE * __restrict__ b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
// #pragma unroll 8
for(; i < N; i += blockDim.x * gridDim.x)
{
COMPLEX_TYPE aval = a[i];
COMPLEX_TYPE bval = b[i];
FLOAT_TYPE Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) * (f);
F[i] = Fval;
}
}
int main(){
COMPLEX_TYPE *d_A, *d_B;
FLOAT_TYPE *d_F, f = 4.0f;
cudaMalloc(&d_A, DSIZE*sizeof(COMPLEX_TYPE));
cudaMalloc(&d_B, DSIZE*sizeof(COMPLEX_TYPE));
cudaMalloc(&d_F, DSIZE*sizeof(FLOAT_TYPE));
//warm-up
compute<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
unsigned long long t1 = dtime_usec(0);
compute<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
t1 = dtime_usec(t1);
//warm-up
compute_imp<<<DSIZE/(8*nTPB),nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
unsigned long long t2 = dtime_usec(0);
compute_imp<<<nBLK,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
t2 = dtime_usec(t2);
//warm-up
compute_imp2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_A, d_B, d_F, 1/f, DSIZE);
cudaDeviceSynchronize();
unsigned long long t3 = dtime_usec(0);
compute_imp2<<<nBLK,nTPB>>>(d_A, d_B, d_F, 1/f, DSIZE);
cudaDeviceSynchronize();
t3 = dtime_usec(t3);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)(USECPSEC), t3/(float)USECPSEC);
}
$ nvcc -O3 -o t891 t891.cu
$ ./t891
t1: 0.000226s, t2: 0.000209s, t3: 0.000110s
$
Notes:
The unroll pragma doesn't seem to help (it makes it run slower, for the few test cases I tried). The compiler will already, in some cases, unroll loops without a specific hint, and loop unrolling is generally an optimization that requires tuning, perhaps careful tuning.
The modification to the kernel proposed by @talonmies to create a grid-striding loop is one of the factors that would need to be taken into account to make a specific loop-unroll trip count useful. The overall grid dimension should be reduced by a factor at least equal to the unroll trip count. However, I wasn't able to find a "sweet spot".
I mostly tested on a Quadro5000 (Fermi cc2.0 GPU), CUDA 7.5RC, Fedora20. Certainly the behavior will be different on different GPUs, especially newer ones.
The nBLK parameter in this code is another "tunable" parameter; however, I saw little variation with it above about 64 or so. The best case might be to have a grid equal in size to the data.
I'm writing a program in OpenCL that receives two arrays of points, and calculates the nearest neighbour for each point.
I have two programs for this. One of them will calculate distance for 4 dimensions, and one for 6 dimensions. They are below:
4 dimensions:
kernel void BruteForce(
global read_only float4* m,
global float4* y,
global write_only ushort* i,
read_only uint mx)
{
int index = get_global_id(0);
float4 curY = y[index];
float minDist = MAXFLOAT;
ushort minIdx = -1;
int x = 0;
int mmx = mx;
for(x = 0; x < mmx; x++)
{
float dist = fast_distance(curY, m[x]);
if (dist < minDist)
{
minDist = dist;
minIdx = x;
}
}
i[index] = minIdx;
y[index] = minDist;
}
6 dimensions:
kernel void BruteForce(
global read_only float8* m,
global float8* y,
global write_only ushort* i,
read_only uint mx)
{
int index = get_global_id(0);
float8 curY = y[index];
float minDist = MAXFLOAT;
ushort minIdx = -1;
int x = 0;
int mmx = mx;
for(x = 0; x < mmx; x++)
{
float8 mx = m[x];
float d0 = mx.s0 - curY.s0;
float d1 = mx.s1 - curY.s1;
float d2 = mx.s2 - curY.s2;
float d3 = mx.s3 - curY.s3;
float d4 = mx.s4 - curY.s4;
float d5 = mx.s5 - curY.s5;
float dist = sqrt(d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3 + d4 * d4 + d5 * d5);
if (dist < minDist)
{
minDist = dist;
minIdx = index;
}
}
i[index] = minIdx;
y[index] = minDist;
}
I'm looking for ways to optimize this program for GPGPU. I've read some articles (including http://www.macresearch.org/opencl_episode6, which comes with source code) about GPGPU optimization using local memory. I've tried applying it and came up with this code:
kernel void BruteForce(
global read_only float4* m,
global float4* y,
global write_only ushort* i,
__local float4 * shared)
{
int index = get_global_id(0);
int lsize = get_local_size(0);
int lid = get_local_id(0);
float4 curY = y[index];
float minDist = MAXFLOAT;
ushort minIdx = 64000;
int x = 0;
for(x = 0; x < {0}; x += lsize)
{
if((x+lsize) > {0})
lsize = {0} - x;
if ( (x + lid) < {0})
{
shared[lid] = m[x + lid];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int x1 = 0; x1 < lsize; x1++)
{
float dist = distance(curY, shared[x1]);
if (dist < minDist)
{
minDist = dist;
minIdx = x + x1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
i[index] = minIdx;
y[index] = minDist;
}
I'm getting garbage results for my 'i' output (e.g. many values that are the same). Can anyone point me in the right direction? I'd appreciate any answer that helps me improve this code, or helps me find the problem with the optimized version above.
Thank you very much
Cauê
One way to get a big speed up here is to use local data structures and compute entire blocks of data at a time. You should also only need a single read/write global vector (float4). The same idea can be applied to the 6d version using smaller blocks. Each work group is able to work freely through the block of data it is crunching. I will leave the exact implementation to you because you will know the specifics of your application.
Some pseudo-ish code (4d):
computeBlockSize is the size of the blocks to read from global memory and crunch. This value should be a multiple of your work-group size. I like 64 as a WG size; it tends to perform well on most platforms. The kernel will be allocating 2 * float4 * computeBlockSize + uint * computeBlockSize of shared (local) memory. (Max value for computeBlockSize: OpenCL 1.0 ~448, OpenCL 1.1 ~896.)
#define computeBlockSize 256
__local float4 blockA[computeBlockSize];
__local float4 blockB[computeBlockSize];
__local uint blockAnearestIndex[computeBlockSize];
Now blockA gets computed against all blockB combinations; this is the job of a single work-group.
*Important*: only blockA ever gets written to. blockB is stored in local memory, but never changed or copied back to global memory.
steps:
load blockA into local memory with async_work_group_copy
blockA is located at get_group_id(0) * computeBlockSize in the global vector
optional: set all blockA 'w' values to MAXFLOAT
optional: load blockAnearestIndex into local memory with async_work_group_copy if needed
need to compute blockA against itself first, then go into the blockB's
be careful to only write to blockA[j], NOT blockA[k]. j is exclusive to this work item
for(j=get_local_id(0); j<computeBlockSize;j++)
for(k=0;k<computeBlockSize; k++)
if(j==k) continue; //no self-comparison
calculate distance of blockA[j] vs blockA[k]
store min distance in blockA[j].w
store global index (= get_group_id(0)*computeBlockSize + k) of nearest in blockAnearestIndex[j]
barrier(local_mem_fence)
for (i=0;i<get_num_groups(0);i++)
if (i==get_group_id(0)) continue;
load blockB into local memory: async_work_group_copy(...)
for(j=get_local_id(0); j<computeBlockSize;j++)
for(k=0;k<computeBlockSize; k++)
calculate distance of blockA[j] vs blockB[k]
store min distance in blockA[j].w
store global index (= i*computeBlockSize +k) of nearest in blockAnearestIndex[j]
barrier(local_mem_fence)
write blockA and blockAnearestIndex to global memory using two async_work_group_copy
There should be no problem in reading a blockB while another work group writes the same block (as its own blockA), because only the W values may have changed. If there happens to be trouble with this -- or if you do require two different vectors of points -- you could use two global vectors like you have above, one with the A's (writeable) and the other with the B's (read only).
This algorithm works best when your global data size is a multiple of computeBlockSize. To handle the edges, two solutions come to mind. I recommend writing a second kernel for the non-square edge blocks that would work in a similar manner as above. The new kernel can execute after the first, and you could save the second PCI-e transfer. Alternatively, you can use a distance of -1 to signify a skip in the comparison of two elements (i.e. if either blockA[j].w == -1 or blockB[k].w == -1, continue). This second solution would result in a lot more branching in your kernel though, which is why I recommend writing a new kernel. A very small percentage of your data points will actually fall in an edge block.
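As a sanity check on the local-memory budget mentioned above, a host-side helper along these lines can verify that a chosen computeBlockSize fits on the device (block_size_fits is a made-up name; the byte count mirrors the two float4 blocks plus one uint block per work-group):
#include <CL/cl.h>
#include <stdio.h>

/* Returns 1 if two float4 blocks plus one uint block of computeBlockSize
 * elements fit into the device's local memory, 0 otherwise. */
static int block_size_fits(cl_device_id device, size_t computeBlockSize)
{
    cl_ulong local_mem = 0;
    clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
                    sizeof(local_mem), &local_mem, NULL);

    size_t needed = computeBlockSize * (2 * sizeof(cl_float4) + sizeof(cl_uint));
    printf("need %zu bytes of local memory, device offers %llu\n",
           needed, (unsigned long long)local_mem);
    return needed <= local_mem;
}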
I want to calculate the product A^T*A (A is a 2000x1000 matrix). I also only want to compute the upper triangle of the result. In the inner loop I have to compute the dot product of two vectors.
Now, here is the problem: using cblas ddot() is not faster than calculating the dot product with a loop. How is this possible? (Using an Intel Core(TM) i7 CPU M620 @ 2.67 GHz, 1.92 GB RAM.)
The problem is caused essentially by matrix size, not by ddot. Your matrices are so large that they do not fit in the cache memory. The solution is to rearrange the three nested loops such that as much as possible is done with a line while it is in cache, thus reducing cache refreshes. A model implementation follows for both the ddot and the daxpy approach. On my computer the time ratio was about 15:1.
In other words: never, never, never program a matrix multiplication along the "row times column" scheme that we learned in school.
/*
Matrix product of A^T * A by two methods.
1) "Row times column" as we learned in school.
2) With rearranged loops such that the need for cache refreshes is reduced
(this can be improved even more).
Compile: gcc -o aT_a aT_a.c -lgslcblas -lblas -lm
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cblas.h>
#define ROWS 2000
#define COLS 1000
static double a[ROWS][COLS];
static double c[COLS][COLS];
static void dot() {
int i, j;
double *ai, *bj;
ai = a[0];
for (i=0; i<COLS; i++) {
bj = a[0];
for (j=0; j<COLS; j++) {
c[i][j] = cblas_ddot(ROWS,ai,COLS,bj,COLS);
bj += 1;
}
ai += 1;
}
}
static void axpy() {
int i, j;
double *ci, *bj, aij;
for (i=0; i<COLS; i++) {
ci = c[i];
for (j=0; j<COLS; j++) ci[j] = 0.;
for (j=0; j<ROWS; j++) {
aij = a[j][i];
bj = a[j];
cblas_daxpy(COLS,aij,bj,1,ci,1);
}
}
}
int main(int argc, char** argv) {
clock_t t0, t1;
int i, j;
for (i=0; i<ROWS; ++i)
for (j=0; j<COLS; ++j)
a[i][j] = i+j;
t0 = clock();
dot();
t1 = clock();
printf("Time for DOT : %f sec.\n",(double)(t1-t0)/CLOCKS_PER_SEC);
t0 = clock();
axpy();
t1 = clock();
printf("Time for AXPY: %f sec.\n",(double)(t1-t0)/CLOCKS_PER_SEC);
return 0;
}
The CBLAS dot product is effectively just a computation in a slightly unrolled loop. The netlib Fortran is just this:
DO I = MP1,N,5
DTEMP = DTEMP + DX(I)*DY(I) + DX(I+1)*DY(I+1) +
$ DX(I+2)*DY(I+2) + DX(I+3)*DY(I+3) + DX(I+4)*DY(I+4)
END DO
i.e. just a loop unrolled to a stride of 5.
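For readers more at home in C, that main loop corresponds roughly to the sketch below (ddot_tail is my own name; the cleanup loop that handles the first n mod 5 elements and sets mp1 is omitted, just as in the Fortran excerpt):
/* Rough C equivalent of the unrolled netlib loop above (0-based indexing). */
static double ddot_tail(const double *dx, const double *dy, int mp1, int n)
{
    double dtemp = 0.0;
    for (int i = mp1; i < n; i += 5)
        dtemp += dx[i]   * dy[i]   + dx[i+1] * dy[i+1] +
                 dx[i+2] * dy[i+2] + dx[i+3] * dy[i+3] + dx[i+4] * dy[i+4];
    return dtemp;
}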
If you must use a ddot style dot product for your operation, you might get a performance boost by re-writing your loop to use SSE2 intrinsics:
#include <emmintrin.h>
double ddotsse2(const double *x, const double *y, const int n)
{
double result[2];
int n2 = 2 * (n/2);
__m128d dtemp;
if ( (n % 2) == 0) {
dtemp = _mm_setzero_pd();
} else {
dtemp = _mm_set_sd(x[n-1] * y[n-1]); /* handle the odd element; x[n] would be out of bounds */
}
for(int i=0; i<n2; i+=2) {
__m128d x1 = _mm_loadu_pd(x+i); /* unaligned loads: no 16-byte alignment requirement */
__m128d y1 = _mm_loadu_pd(y+i);
__m128d xy = _mm_mul_pd(x1, y1);
dtemp = _mm_add_pd(dtemp, xy);
}
_mm_storeu_pd(result, dtemp);
return result[0] + result[1];
}
(not tested, never been compiled, buyer beware).
This may or may not be faster than the standard BLAS implementation. You may also want to investigate whether further loop unrolling could improve performance.
If you're not using SSE2 intrinsics, or are using a data type that may not benefit from them, you can try transposing the matrix for an easy performance improvement for larger matrix multiplications with cblas_?dot. Performing the matrix multiplication in blocks also helps.
void matMulDotProduct(int n, float *A, float* B, int a_size, int b_size, int a_row, int a_col, int b_row, int b_col, float *C) {
int i, j, k;
MKL_INT incx, incy;
incx = 1;
incy = b_size;
//copy out multiplying matrix from larger matrix
float *temp = (float*) malloc(n * n * sizeof(float));
for (i = 0; i < n; ++i) {
cblas_scopy(n, &B[(b_row * b_size) + b_col + i], incy, &temp[i * n], 1);
}
//transpose
mkl_simatcopy('R', 'T', n, n, 1.0, temp, n, n);
for (i = 0; i < n; i+= BLOCK_SIZE) {
for (j = 0; j < n; j++) {
for (k = 0; k < BLOCK_SIZE; ++k) {
C[((i + k) * n) + j] = cblas_sdot(n, &A[(a_row + i + k) * a_size + a_col], incx, &temp[n * j], 1);
}
}
}
free(temp);
}
On my machine, this code is about 1 order of magnitude faster than the 3-loop code (but also 1 order of magnitude slower than a cblas_?gemm call) for single-precision floats and 2K by 2K matrices. (I'm using Intel MKL.)
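For reference, the cblas_?gemm call mentioned above would look something like this for the A^T*A product in the original question (a sketch assuming row-major storage; at_a_gemm is my own name):
#include <cblas.h>

/* C = A^T * A, with A stored row-major as rows x cols floats and
 * C a cols x cols result buffer. */
void at_a_gemm(const float *A, float *C, int rows, int cols)
{
    /* op(A) = A^T (cols x rows) times A (rows x cols) -> cols x cols */
    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                cols, cols, rows,
                1.0f, A, cols,
                      A, cols,
                0.0f, C, cols);
}
If only the upper triangle is needed, the BLAS ?syrk routines compute exactly this kind of A^T*A product, but the gemm form matches the comparison made above.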