Is there any way to accelerate this simple kernel function? I have thought about using shared memory, but N is equal to 507904, so it is far larger than a shared-memory array could hold.
My program launches blocks of 256 threads each.
__global__ void compute(COMPLEX_TYPE *a, COMPLEX_TYPE *b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
F[i] = ( a[i].x*a[i].x + a[i].y*a[i].y + b[i].x*b[i].x + b[i].y*b[i].y) / (f);
}
}
The simplest general optimisation would be something like this:
__global__ void compute(const COMPLEX_TYPE * __restrict__ a,
const COMPLEX_TYPE * __restrict__ b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
#pragma unroll 8
for(; i < N; i += blockDim.x * gridDim.x)
{
COMPLEX_TYPE aval = a[i], bval = b[i];
FLOAT_TYPE Fval;
Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) / (f);
F[i] = Fval;
}
}
[disclaimer: written in browser, not tested, use at own risk]
The idea here is to launch only as many threads as will execute concurrently on your target GPU, and then have every thread perform multiple operations rather than one. This helps amortise a lot of the fixed overhead at the block-scheduler and setup-code level and improves overall efficiency. On most architectures this will probably be memory-bandwidth limited anyway, so memory coalescing and transaction optimisation is about the most important performance optimisation you will be able to make.
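As a rough, untested sketch (my addition, same caveats as the kernel above), the "as many threads as will execute concurrently" launch configuration can be derived from the CUDA occupancy API rather than hard-coding a grid size; the variable names simply mirror the kernel signature above:
// Sketch: size the grid to roughly one full wave of resident blocks,
// and let the grid-stride loop in the kernel cover the rest of N.
int blockSize = 256;                        // matches the question's block size
int numSms = 0, blocksPerSm = 0;
cudaDeviceGetAttribute(&numSms, cudaDevAttrMultiProcessorCount, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, compute, blockSize, 0);
int gridSize = blocksPerSm * numSms;        // "just enough" blocks to fill the GPU
compute<<<gridSize, blockSize>>>(a, b, F, f, N);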
EDIT: Since this answer was marked CW, I elected to add my tests here rather than create my own answer. If anyone objects to this, please just roll back the edit to a previous acceptable version. I'm not adding any new ideas, just testing those provided by @talonmies and @JanLucas.
In my test case, the suggestions (excepting the unroll pragma) offered by @talonmies seem to give rise to a ~10% perf improvement. The suggestion by @JanLucas, to replace the floating-point divide with a floating-point multiply, if acceptable, seems to give about a doubling of performance. This will obviously vary depending on GPU and other specifics. Here's my test:
$ cat t891.cu
#include <cuComplex.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE 507904
#define nTPB 256
#define nBLK 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
typedef cuFloatComplex COMPLEX_TYPE;
typedef float FLOAT_TYPE;
__global__ void compute(COMPLEX_TYPE *a, COMPLEX_TYPE *b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
F[i] = ( a[i].x*a[i].x + a[i].y*a[i].y + b[i].x*b[i].x + b[i].y*b[i].y) / (f);
}
}
__global__ void compute_imp(const COMPLEX_TYPE * __restrict__ a,
const COMPLEX_TYPE * __restrict__ b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
// #pragma unroll 8
for(; i < N; i += blockDim.x * gridDim.x)
{
COMPLEX_TYPE aval = a[i];
COMPLEX_TYPE bval = b[i];
FLOAT_TYPE Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) / (f);
F[i] = Fval;
}
}
__global__ void compute_imp2(const COMPLEX_TYPE * __restrict__ a,
const COMPLEX_TYPE * __restrict__ b,
FLOAT_TYPE *F, FLOAT_TYPE f, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
// #pragma unroll 8
for(; i < N; i += blockDim.x * gridDim.x)
{
COMPLEX_TYPE aval = a[i];
COMPLEX_TYPE bval = b[i];
FLOAT_TYPE Fval = ( aval.x*aval.x + aval.y*aval.y + bval.x*bval.x + bval.y*bval.y) * (f);
F[i] = Fval;
}
}
int main(){
COMPLEX_TYPE *d_A, *d_B;
FLOAT_TYPE *d_F, f = 4.0f;
cudaMalloc(&d_A, DSIZE*sizeof(COMPLEX_TYPE));
cudaMalloc(&d_B, DSIZE*sizeof(COMPLEX_TYPE));
cudaMalloc(&d_F, DSIZE*sizeof(FLOAT_TYPE));
//warm-up
compute<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
unsigned long long t1 = dtime_usec(0);
compute<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
t1 = dtime_usec(t1);
//warm-up
compute_imp<<<DSIZE/(8*nTPB),nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
unsigned long long t2 = dtime_usec(0);
compute_imp<<<nBLK,nTPB>>>(d_A, d_B, d_F, f, DSIZE);
cudaDeviceSynchronize();
t2 = dtime_usec(t2);
//warm-up
compute_imp2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_A, d_B, d_F, 1/f, DSIZE);
cudaDeviceSynchronize();
unsigned long long t3 = dtime_usec(0);
compute_imp2<<<nBLK,nTPB>>>(d_A, d_B, d_F, 1/f, DSIZE);
cudaDeviceSynchronize();
t3 = dtime_usec(t3);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)(USECPSEC), t3/(float)USECPSEC);
}
$ nvcc -O3 -o t891 t891.cu
$ ./t891
t1: 0.000226s, t2: 0.000209s, t3: 0.000110s
$
Notes:
The unroll pragma doesn't seem to help (it makes the code run slower for the few test cases I tried). The compiler will already, in some cases, unroll loops without a specific hint, and loop unrolling is generally an optimization that requires tuning, perhaps careful tuning.
The modification to the kernel proposed by @talonmies to create a grid-striding loop is one of the factors that would need to be taken into account to make a specific loop-unroll trip count useful. The overall grid dimension should be reduced by a factor equal to the unroll trip count, at least. However, I wasn't able to find a "sweet spot".
I mostly tested on a Quadro5000 (Fermi cc2.0 GPU), CUDA 7.5RC, Fedora20. Certainly the behavior will be different on different GPUs, especially newer ones.
The nBLK parameter in this code is another "tunable" parameter, however I saw little variation with this when above about 64 or so. The best case might be to have a grid equal in size to the data.
Related
I have created a program to test GPU RAM bandwidth and I see several strange effects with it.
One is that the straightforward implementation is not purely memory bound. If I unroll the
for (int *p = pStart; p < pEnd; p += shift)
sum += *p;
loop by adding
p += shift; sum += *p;
p += shift; sum += *p;
p += shift; sum += *p;
into the loop body, the speed rises by 20%, from 183 to 222 GB/s on a GeForce RTX 2060 (theoretical bandwidth 264 GB/s).
Shouldn't such things be hidden by memory latency?
Or, in the ideal case for such a program, should all warps just be waiting for data from memory, so that every additional calculation a warp does between such waits leaves the memory bus underloaded?
I analyzed the executables with NVIDIA Nsight Compute 2021.3.0; it reports lower compute throughput and higher memory throughput. Cache usage is negligible. This all makes sense.
The scheduler statistics are more interesting, and I don't understand the meaning of these numbers. Could someone explain them, please? I am not a newbie in CUDA; I understand that warps can move through the pipeline or stall while waiting for data from RAM, but I don't understand well what is meant by eligible warps or warp cycles per issued instruction (in the unrolled version, did each warp issue an instruction only once per 114 GPU clock cycles on average?).
"Base" is without unrolling, the main values are with it. So, for example, with unrolling we have 7.88 active warps per scheduler, without it 7.96 (isn't more better?).
Full code (look at __global__ void gpuReadMemory):
#include <assert.h>
#include <conio.h>
#include <stdio.h>
#include <cuda.h>
#define VC_EXTRALEAN
#include <windows.h>
#define CUDA_CHECK(err) __cudaSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
if (err != cudaSuccess)
{
fprintf(stderr, "%s(%i): CUDA error %d (%s)\n",
file, line, int(err), cudaGetErrorString(err));
throw "CUDA error";
}
}
int getMultiprocessorCount()
{
int num;
CUDA_CHECK(cudaDeviceGetAttribute(&num, cudaDevAttrMultiProcessorCount, 0));
return num;
}
__global__ void gpuWriteMemory(int *gpuArr, int dataSizeInMBs,
int packetShift, int passCount, int *gpuDebugArr)
{
int *pStart = gpuArr + ((long long)packetShift *
(blockDim.y * blockIdx.x + threadIdx.y)) / sizeof(int) + threadIdx.x;
int *pEnd = gpuArr + (((long long)dataSizeInMBs) << 20) / sizeof(int);
int shift = gridDim.x * blockDim.y * packetShift / sizeof(int);
for (int passInd = 0; passInd < passCount; passInd++)
for (int *p = pStart; p < pEnd; p += shift)
*p = blockIdx.x * 10000000 + threadIdx.y * 1000 + threadIdx.x;
}
__global__ void gpuReadMemory(int *gpuArr, int dataSizeInMBs,
int packetShift, int passCount, int *gpuDebugArr)
{
int *pStart = gpuArr + ((long long)packetShift *
(blockDim.y * blockIdx.x + threadIdx.y)) / sizeof(int) + threadIdx.x;
int *pEnd = gpuArr + (((long long)dataSizeInMBs) << 20) / sizeof(int);
int shift = gridDim.x * blockDim.y * packetShift / sizeof(int);
int sum = 0;
int accessCount = 0;
for (int passInd = 0; passInd < passCount; passInd++)
{
#pragma unroll // - doesn't have effect
for (int *p = pStart; p < pEnd; p += shift)
{
sum += *p;
p += shift; sum += *p;
p += shift; sum += *p;
p += shift; sum += *p;
}
*pStart = sum; // Without it bandwidth reported is 3 times bigger than theoretical
}
// Suspiciously fast code:
//for (int passInd = 0; passInd < passCount; passInd++)
// 0x0000000500999d10 IADD3 R8, R8, 0x1, RZ
// 0x0000000500999d20 BSSY B0, 0x500999de0
// for (int *p = pStart; p < pEnd; p += shift)
// 0x0000000500999d30 ISETP.GE.AND.EX P0, PT, R7, UR6, PT, P0
// for (int passInd = 0; passInd < passCount; passInd++)
// 0x0000000500999d40 ISETP.GE.AND P1, PT, R8, c[0x0][0x170], PT
// for (int *p = pStart; p < pEnd; p += shift)
// 0x0000000500999d50 #P0 BRA 0x500999dd0
// 0x0000000500999d60 IMAD.MOV.U32 R3, RZ, RZ, R2
// 0x0000000500999d70 IMAD.MOV.U32 R5, RZ, RZ, R4
// 0x0000000500999d80 LEA R3, P0, R0, R3, 0x2
// 0x0000000500999d90 LEA.HI.X R5, R0, R5, RZ, 0x2, P0
// 0x0000000500999da0 ISETP.GE.U32.AND P0, PT, R3, UR4, PT
// 0x0000000500999db0 ISETP.GE.U32.AND.EX P0, PT, R5, UR5, PT, P0
// 0x0000000500999dc0 #!P0 BRA 0x500999d80
// 0x0000000500999dd0 BSYNC B0
// Normal code:
//for (int *p = pStart; p < pEnd; p += shift)
// 0x0000000500999da0 IMAD.MOV.U32 R8, RZ, RZ, R6
// sum += *p;
//0x0000000500999db0 IMAD.MOV.U32 R4, RZ, RZ, R8
// 0x0000000500999dc0 LDG.E.SYS R4, [R4] // Reading from memory
// for (int *p = pStart; p < pEnd; p += shift)
// 0x0000000500999dd0 IADD3 R11, P0, R11, UR5, RZ
// 0x0000000500999de0 IADD3 R8, P1, R8, UR5, RZ
// 0x0000000500999df0 IADD3.X R13, R13, UR4, RZ, P0, !PT
// 0x0000000500999e00 ISETP.GE.U32.AND P0, PT, R11, R2, PT
// 0x0000000500999e10 IADD3.X R5, R5, UR4, RZ, P1, !PT
// 0x0000000500999e20 ISETP.GE.U32.AND.EX P0, PT, R13, R3, PT, P0
// sum += *p;
//0x0000000500999e30 IMAD.IADD R9, R4, 0x1, R9
// for (int *p = pStart; p < pEnd; p += shift)
// 0x0000000500999e40 #!P0 BRA 0x500999db0
// 0x0000000500999e50 BSYNC B0
// *pStart = sum; // Lowers bandwidth being reported on GT 520M from 33 to 9.7 GB/s
//0x0000000500999e60 STG.E.SYS [R6], R9
}
class CMemorySpeedTester
{
public:
CMemorySpeedTester()
{
m_multiprocessorCount = getMultiprocessorCount();
CUDA_CHECK(cudaMalloc((void**)&gpuArr, ((long long)dataSizeInMBs) << 20));
CUDA_CHECK(cudaMalloc((void**)&gpuDebugArr, debugArrLen * sizeof(int)));
CUDA_CHECK(cudaMemset(gpuArr, -1, ((long long)dataSizeInMBs) << 20));
debugArr = (int*)malloc(debugArrLen * sizeof(int));
debugArr2D = (int (*)[1000])debugArr;
QueryPerformanceFrequency(&timerFreq);
}
void testBandwidth()
{
int threadPerBlock = 256;
int blockCount = m_multiprocessorCount * 24;
dim3 blocks(blockCount);
dim3 threads1(32, threadPerBlock / 32);
gpuWriteMemory<<<blocks, threads1>>>(gpuArr, dataSizeInMBs,
32 * sizeof(int), 1, gpuDebugArr);
CUDA_CHECK(cudaDeviceSynchronize());
for (int passCount = 10; passCount <= 100; passCount *= 10)
{
int threadPerPacket = 32;
for (int packetShiftMult = 1; packetShiftMult <= 16; packetShiftMult *= 16)
{
int packetShift = threadPerPacket * sizeof(int) * packetShiftMult;
dim3 threads(threadPerPacket, threadPerBlock / threadPerPacket);
QueryPerformanceCounter(&t0);
gpuReadMemory<<<blocks, threads>>>(gpuArr, dataSizeInMBs,
packetShift, passCount, gpuDebugArr);
CUDA_CHECK(cudaDeviceSynchronize());
QueryPerformanceCounter(&t);
double dt = double(t.QuadPart - t0.QuadPart) / timerFreq.QuadPart;
printf(" %2d th./packet, packet shift %4d, %d pass(es): %.3f ms, %.2f GB/s\n",
threadPerPacket, packetShift, passCount,
dt * 1000, (double)(dataSizeInMBs) / packetShiftMult / (1 << 10) * passCount / dt);
}
}
}
void copyToHostDebugArr()
{
CUDA_CHECK(cudaMemcpy(debugArr, gpuDebugArr, debugArrLen * sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
}
protected:
static const int dataSizeInMBs = 800;
static const int debugArrLen = 4000000;
int m_multiprocessorCount;
int *gpuArr, *gpuDebugArr;
int *debugArr;
int (*debugArr2D)[1000];
LARGE_INTEGER timerFreq, t, t0;
};
int main(int argc, char **argv)
{
printf("Started\n");
CMemorySpeedTester runner;
for (int runInd = 0; runInd < 5; runInd++)
{
printf("%d.\n", runInd);
runner.testBandwidth();
}
printf("Finished. Press Enter...");
getch();
}
Windows 10 x64, Visual Studio 2019, CUDA Toolkit 10.2; the effect can also be compiled and reproduced on a GeForce GT 520M with Windows 7 x64, Visual Studio 2010, and CUDA 7.5.
So I'm currently trying to write a kernel in OpenCL with the goal of sum reducing each row of a matrix (g_idata) into an array (g_odata). Said matrix is represented by a float array with column_count * row_count length, and the resulting array has a length of row_count. As such I've implemented the following kernel:
#define T float
#define Operation(X, Y) ((X) + (Y))
__kernel void marrow_kernel( __global T *g_odata,__global T *g_idata,
const unsigned long column_count, const unsigned long row_count, __local volatile T* sdata) {
size_t tid = get_local_id(0);
size_t gid = get_global_id(0);
size_t row = gid / column_count;
size_t column = gid % column_count;
if(row < row_count && column < column_count)
{
sdata[tid] = g_idata[gid];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(row < row_count && column < column_count)
{
size_t step = column_count / 2;
size_t limit = column_count;
while(step > 0)
{
if(column + step < limit) {
if(tid + step < get_local_size(0))
{
sdata[tid] = Operation(sdata[tid], sdata[tid + step]);
}
else if (gid + step < column_count * row_count)
{
sdata[tid] = Operation(sdata[tid], g_idata[gid + step]);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
step /= 2;
limit /= 2;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(row < row_count && column == 0)
{
g_odata[row] = column_count % 2 == 0 ? sdata[tid] : sdata[tid] + g_idata[gid + (column_count - 1)];
}
}
Said kernel is currently being launched with a work-group size of 128 work-items. I currently have no control over the size of the work-group.
Now here's the issue: if, let's say, a row is shared between two different work-groups, the kernel returns the wrong result, because it falls back to fetching the value from g_idata; it is impossible to access the partial result held in the other work-group's local memory. After the first iteration that value is stale, and it affects the final result of the operation.
Can anyone give me a hint on how to solve this problem?
I have a quadratic optimization problem with linear constraints that I want to solve using SCIP. The optimization matrix that I want to be minimized is positive semi-definite (it is the variance of certain variables, to be precise). I have the problem in a file in CPLEX LP format and when I optimize in SCIP, I get the message
Quadratic constraint handler does not have LAPACK for eigenvalue computation. Will assume
that matrices (with size > 2x2) are indefinite.
So SCIP starts the optimization assuming that the matrix is indefinite and takes a large amount of time. I have installed LAPACK and even copied the liblapack.a file into the lib folder where the SCIP source and binaries are, and reinstalled SCIP, but I keep getting the above message.
Is there a way to make SCIP use the LAPACK library? I believe the optimization will be really fast, if SCIP can figure out that the matrix is positive semi-definite.
If you feel like patching up SCIP a bit to use your Lapack lib without providing a full Ipopt (though it's relatively easy to build on *nix and could help performance, as mattmilten pointed out), here is a patch that you could try out:
diff --git a/src/scip/cons_quadratic.c b/src/scip/cons_quadratic.c
index 93ba359..795bade 100644
--- a/src/scip/cons_quadratic.c
+++ b/src/scip/cons_quadratic.c
@@ -46,7 +46,7 @@
#include "scip/heur_trysol.h"
#include "scip/debug.h"
#include "nlpi/nlpi.h"
-#include "nlpi/nlpi_ipopt.h"
+/*#include "nlpi/nlpi_ipopt.h" */
/* constraint handler properties */
#define CONSHDLR_NAME "quadratic"
@@ -4257,6 +4257,71 @@ void checkCurvatureEasy(
*determined = FALSE;
}
+#define F77_FUNC(a,A) a##_
+
+ /** LAPACK Fortran subroutine DSYEV */
+ void F77_FUNC(dsyev,DSYEV)(
+ char* jobz, /**< 'N' to compute eigenvalues only, 'V' to compute eigenvalues and eigenvectors */
+ char* uplo, /**< 'U' if upper triangle of A is stored, 'L' if lower triangle of A is stored */
+ int* n, /**< dimension */
+ double* A, /**< matrix A on entry; orthonormal eigenvectors on exit, if jobz == 'V' and info == 0; if jobz == 'N', then the matrix data is destroyed */
+ int* ldA, /**< leading dimension, probably equal to n */
+ double* W, /**< buffer for the eigenvalues in ascending order */
+ double* WORK, /**< workspace array */
+ int* LWORK, /**< length of WORK; if LWORK = -1, then the optimal workspace size is calculated and returned in WORK(1) */
+ int* info /**< == 0: successful exit; < 0: illegal argument at given position; > 0: failed to converge */
+ );
+
+/** Calls Lapacks Dsyev routine to compute eigenvalues and eigenvectors of a dense matrix.
+ */
+static
+SCIP_RETCODE LapackDsyev(
+ SCIP_Bool computeeigenvectors,/**< should also eigenvectors should be computed ? */
+ int N, /**< dimension */
+ SCIP_Real* a, /**< matrix data on input (size N*N); eigenvectors on output if computeeigenvectors == TRUE */
+ SCIP_Real* w /**< buffer to store eigenvalues (size N) */
+ )
+{
+ int INFO;
+ char JOBZ = computeeigenvectors ? 'V' : 'N';
+ char UPLO = 'L';
+ int LDA = N;
+ double* WORK = NULL;
+ int LWORK;
+ double WORK_PROBE;
+ int i;
+
+ /* First we find out how large LWORK should be */
+ LWORK = -1;
+ F77_FUNC(dsyev,DSYEV)(&JOBZ, &UPLO, &N, a, &LDA, w, &WORK_PROBE, &LWORK, &INFO);
+ if( INFO != 0 )
+ {
+ SCIPerrorMessage("There was an error when calling DSYEV. INFO = %d\n", INFO);
+ return SCIP_ERROR;
+ }
+
+ LWORK = (int) WORK_PROBE;
+ assert(LWORK > 0);
+
+ SCIP_ALLOC( BMSallocMemoryArray(&WORK, LWORK) );
+
+ for( i = 0; i < LWORK; ++i )
+ WORK[i] = i;
+
+ F77_FUNC(dsyev,DSYEV)(&JOBZ, &UPLO, &N, a, &LDA, w, WORK, &LWORK, &INFO);
+
+ BMSfreeMemoryArray(&WORK);
+
+ if( INFO != 0 )
+ {
+ SCIPerrorMessage("There was an error when calling DSYEV. INFO = %d\n", INFO);
+ return SCIP_ERROR;
+ }
+
+ return SCIP_OKAY;
+}
+
+
/** checks a quadratic constraint for convexity and/or concavity */
static
SCIP_RETCODE checkCurvature(
@@ -4343,7 +4408,7 @@ SCIP_RETCODE checkCurvature(
return SCIP_OKAY;
}
- if( SCIPisIpoptAvailableIpopt() )
+ if( TRUE )
{
for( i = 0; i < consdata->nbilinterms; ++i )
{
@@ -4479,7 +4544,7 @@ SCIP_RETCODE checkFactorable(
return SCIP_OKAY;
/* need routine to compute eigenvalues/eigenvectors */
- if( !SCIPisIpoptAvailableIpopt() )
+ if( !TRUE )
return SCIP_OKAY;
SCIP_CALL( consdataSortQuadVarTerms(scip, consdata) );
@@ -9395,7 +9460,7 @@ SCIP_DECL_CONSINITSOL(consInitsolQuadratic)
SCIP_CALL( SCIPcatchEvent(scip, SCIP_EVENTTYPE_SOLFOUND, eventhdlr, (SCIP_EVENTDATA*)conshdlr, &conshdlrdata->newsoleventfilterpos) );
}
- if( nconss != 0 && !SCIPisIpoptAvailableIpopt() && !SCIPisInRestart(scip) )
+ if( nconss != 0 && !TRUE && !SCIPisInRestart(scip) )
{
SCIPverbMessage(scip, SCIP_VERBLEVEL_HIGH, NULL, "Quadratic constraint handler does not have LAPACK for eigenvalue computation. Will assume that matrices (with size > 2x2) are indefinite.\n");
}
Use USRLDFLAGS="-llapack -lblas" with make.
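For example (my addition; assuming the standard SCIP Makefile), the build invocation would then look something like:
make USRLDFLAGS="-llapack -lblas"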
Currently, SCIP is only able to use LAPACK through Ipopt. Performance on nonlinear problems is usually better when SCIP is compiled with Ipopt support, so it is definitely recommended. Run
make IPOPT=true
and make sure you have Ipopt installed beforehand.
I am trying to optimize a function (say, find its minimum) with n parameters (Xn). All Xi are bound to a certain range (for example -200 to 200), and if any parameter leaves this range the function goes to infinity very fast. However, n can be large (from 20 to about 60-70) and computing the function's value takes a long time.
I don't think the details of the function are of great relevance, but here are some: it consists of a weighted sum of 20-30 smaller functions (all different), which in turn consist of sums of dot products wrapped in inverse trigonometric functions (arcsin, arccos, arctan, etc.). Something like arcsin(X1 . X2) + arcsin(X4 . X7) + ....
The function has many local minima in general, so approaches such as (naive) conjugate gradients or quasi-Newton are useless. Brute-force search over the entire domain is too slow.
My initial idea was to use some sort of massive parallelization in combination with a genetic algorithm, which performs many searches on different spots in the domain of the function, and at regular intervals checks whether some of the searches reached local minima. If yes, it compares them and discards all results but the smallest one, and continues the search until a reasonably small value is found.
My two questions are:
1) Is it possible to implement this problem in CUDA or a similar technology? Can CUDA compute the value of a function like this fast enough?
2) Would it be better/faster to implement the problem on a multicore PC (with 12+ cores)?
It is certainly possible to implement global optimization algorithms on GPUs, but you may have to modify the algorithms to get the best performance. I have personally implemented about a half dozen population-based metaheuristics on GPUs, and it is possible to get excellent speedups relative to a multi-threaded CPU.
With population-based algorithms iterated over generations, you may find that the population size becomes the limiting factor in your performance. If your objective function is not easily parallelizable, then the maximum number of parallel threads is generally the number of candidate solutions in the population. GPUs work best with, at minimum, tens of thousands of simultaneous threads, which is not always practical for population-based algorithms due to population inertia and other effects.
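To make that limitation concrete, here is a minimal, untested sketch (my addition, not the answerer's code) of the two obvious mappings for a toy objective of the arcsin-of-products form described in the question; popSize, dim and the fixed block size of 128 are illustrative assumptions:
#include <cuda_runtime.h>
#include <math.h>
// One thread per candidate: total parallelism is capped by the population size.
__global__ void evalOneThreadPerCandidate(const float *pop, float *cost, int popSize, int dim)
{
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (c >= popSize) return;
    const float *x = pop + c * dim;
    float s = 0.f;
    for (int i = 0; i + 1 < dim; i += 2)                 // toy arcsin(x_i * x_{i+1}) terms
        s += asinf(x[i] * x[i + 1]);
    cost[c] = s;
}
// One block per candidate (launch as <<<popSize, 128>>>): the objective's inner sum is
// parallelized as well, so the thread count becomes popSize * 128 instead of popSize.
__global__ void evalOneBlockPerCandidate(const float *pop, float *cost, int dim)
{
    __shared__ float partial[128];                       // assumes blockDim.x == 128
    const float *x = pop + blockIdx.x * dim;
    float s = 0.f;
    for (int i = 2 * threadIdx.x; i + 1 < dim; i += 2 * blockDim.x)
        s += asinf(x[i] * x[i + 1]);
    partial[threadIdx.x] = s;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {   // block reduction
        if (threadIdx.x < stride) partial[threadIdx.x] += partial[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0) cost[blockIdx.x] = partial[0];
}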
You can get around the population limitations to some extent by running multiple instances of the optimization in parallel, each starting with a different random seed. Further, you can arrange for the parallel instances to communicate over some network topology (this would be an island model algorithm), so that the parallel instances can collaboratively evolve solutions to complex problems. I have actually implemented this in OpenCL for my MScE thesis.
So overall, here are succinct answers to your questions:
1) Yes, you can implement this in CUDA or a similar technology. Your speedup will depend on how parallelizable your objective function is and how many parallel instances you want to run at the same time.
2) It would almost certainly be faster to implement on a CPU, due to a wider range of existing libraries and the fact that CPU programming models are conceptually simpler than GPU models. Whether or not it is "better" depends on what you value.
This is still an area of active research, and it will probably take you longer to build a working GPU implementation than it would take for a multicore CPU implementation. If implementation time is your primary concern, I would recommend that you take a look at the PaGMO project (and its Python bindings PyGMO), which is an excellent implementation of an island-model optimizer with a wide range of local and global optimization functions included. The islands can be assigned any arbitrary algorithm to run independently, and you can specify exactly how they communicate to collaboratively optimize your objective function.
http://sourceforge.net/apps/mediawiki/pagmo/index.php?title=Main_Page
Your question is well within my research area, and I would be happy to help you further if you need it.
Can CUDA compute the value of a function like this fast enough?
Many cost functionals are expressed in the form of summations of a certain number of terms. Examples are the:
Sphere function
Rosenbrock function
Styblinski-Tang function
In all those cases, the evaluation of the cost function can be performed by a reduction, or better, a transformation followed by a reduction.
CUDA Thrust has thrust::transform_reduce, which can certainly serve the purpose, but of course you can set up your own transformation + reduction routines.
Below, I'm providing an example of how you can compute the Rosenbrock functional using either CUDA Thrust or a customized version of the reduction routine offered by the CUDA examples. In the latter case, a pointer to a __device__ transformation function is passed to the customized transform_reduce function if the EXTERNAL keyword is defined; otherwise, the transformation function is defined and compiled in the compilation unit of the customized transform_reduce routine.
Some performance results on a Kepler K20c card for the non-EXTERNAL case:
N = 90000 Thrust = 0.055ms Customized = 0.059ms
N = 900000 Thrust = 0.67ms Customized = 0.14ms
N = 9000000 Thrust = 0.85ms Customized = 0.87ms
Here is the code. For the timing functions, please see the OrangeOwlSolutions/Timing and OrangeOwlSolutions/CUDA_Utilities GitHub projects.
Please note that separate compilation is required.
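As a hedged example of such a build (my addition; the exact list of sources for the Utilities and Timing helpers depends on the linked OrangeOwlSolutions projects), the relocatable-device-code flag is what enables the separate compilation:
nvcc -rdc=true -O3 -o test kernel.cu transform_reduce.cu Utilities.cu TimingGPU.cu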
kernel.cu
// --- Requires separate compilation
#include <stdio.h>
#include <thrust/device_vector.h>
#include "transform_reduce.cuh"
#include "Utilities.cuh"
#include "TimingCPU.h"
#include "TimingGPU.cuh"
/***************************************/
/* COST FUNCTION - GPU/CUSTOMIZED CASE */
/***************************************/
// --- Transformation function
__device__ __forceinline__ float transformation(const float * __restrict__ x, const int i) { return (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f)) ; }
// --- Device-side function pointer
__device__ pointFunction_t dev_pfunc = transformation;
/***********************************/
/* COST FUNCTION - GPU/THRUST CASE */
/***********************************/
struct CostFunctionStructGPU{
template <typename Tuple>
__host__ __device__ float operator()(Tuple a) {
float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a));
float temp2 = (thrust::get<0>(a) - 1.f);
return 100.f * temp1 * temp1 + temp2 * temp2;
}
};
/********/
/* MAIN */
/********/
int main()
{
const int N = 90000000;
float *x = (float *)malloc(N * sizeof(float));
for (int i=0; i<N; i++) x[i] = 3.f;
float *d_x; gpuErrchk(cudaMalloc((void**)&d_x, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice));
/************************************************/
/* "CUSTOMIZED" DEVICE-SIDE TRANSFORM REDUCTION */
/************************************************/
float customizedDeviceResult = transform_reduce(d_x, N - 1, &dev_pfunc);
TimingGPU timerGPU;
timerGPU.StartCounter();
customizedDeviceResult = transform_reduce(d_x, N - 1, &dev_pfunc);
printf("Timing for 'customized', device-side transform reduction = %f\n", timerGPU.GetCounter());
printf("Result for 'customized', device-side transform reduction = %f\n", customizedDeviceResult);
printf("\n\n");
/************************************************/
/* THRUST-BASED DEVICE-SIDE TRANSFORM REDUCTION */
/************************************************/
thrust::device_vector<float> d_vec(N,3.f);
timerGPU.StartCounter();
float ThrustResult = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(d_vec.begin(), d_vec.begin() + 1)), thrust::make_zip_iterator(thrust::make_tuple(d_vec.begin() + N - 1, d_vec.begin() + N)), CostFunctionStructGPU(), 0.f, thrust::plus<float>());
printf("Timing for Thrust-based, device-side transform reduction = %f\n", timerGPU.GetCounter());
printf("Result for Thrust-based, device-side transform reduction = %f\n", ThrustResult);
printf("\n\n");
/*********************************/
/* HOST-SIDE TRANSFORM REDUCTION */
/*********************************/
// thrust::host_vector<float> h_vec(d_vec);
//sum_host = sum_host + transformation(thrust::raw_pointer_cast(h_vec.data()), i);
TimingCPU timerCPU;
timerCPU.StartCounter();
float sum_host = 0.f;
for (int i=0; i<N-1; i++) {
float temp = (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f));
sum_host = sum_host + temp;
//printf("%i %f %f\n", i, temp, sum_host);
}
printf("Timing for host-side transform reduction = %f\n", timerCPU.GetCounter());
printf("Result for host-side transform reduction = %f\n", sum_host);
printf("\n\n");
sum_host = 0.f;
float c = 0.f;
for (int i=0; i<N-1; i++) {
float temp = (100.f * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - 1.f) * (x[i] - 1.f)) - c;
float t = sum_host + temp;
c = (t - sum_host) - temp;
sum_host = t;
}
printf("Result for host-side transform reduction = %f\n", sum_host);
// cudaDeviceReset();
}
transform_reduce.cuh
#ifndef TRANSFORM_REDUCE_CUH
#define TRANSFORM_REDUCE_CUH
// --- Function pointer type
// --- Complete with your own favourite instantiations
typedef float(*pointFunction_t)(const float * __restrict__, const int);
template <class T> T transform_reduce(T *, unsigned int, pointFunction_t *);
#endif
transform_reduce.cu
#include <stdio.h>
#include "Utilities.cuh"
#include "transform_reduce.cuh"
#define BLOCKSIZE 512
#define warpSize 32
// --- Host-side function pointer
pointFunction_t h_pfunc;
// --- Uncomment if you want to apply CUDA error checking to the kernel launches
//#define DEBUG
//#define EXTERNAL
/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
/*************************************/
/* CHECK IF A NUMBER IS A POWER OF 2 */
/*************************************/
// --- Note: although x = 1 is a power of 2 (1 = 2^0), this routine returns 0 for x == 1.
bool isPow2(unsigned int x) {
if (x == 1) return 0;
else return ((x&(x-1))==0);
}
/***************************/
/* TRANSFORMATION FUNCTION */
/***************************/
template <class T>
__host__ __device__ __forceinline__ T transformation(const T * __restrict__ x, const int i) { return ((T)100 * (x[i+1] - x[i] * x[i]) * (x[i+1] - x[i] * x[i]) + (x[i] - (T)1) * (x[i] - (T)1)) ; }
/********************/
/* REDUCTION KERNEL */
/********************/
/*
This version adds multiple elements per thread sequentially. This reduces the overall
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
(Brent's Theorem optimization)
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reductionKernel(T *g_idata, T *g_odata, unsigned int N, pointFunction_t pPointTransformation)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // Local thread index
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; // Global thread index - Fictitiously double the block dimension
unsigned int gridSize = blockSize*2*gridDim.x;
// --- Performs the first level of reduction in registers when reading from global memory on multiple elements per thread.
// More blocks will result in a larger gridSize and therefore fewer elements per thread
T mySum = 0;
while (i < N) {
#ifdef EXTERNAL
mySum += (*pPointTransformation)(g_idata, i);
#else
mySum += transformation(g_idata, i);
#endif
// --- Ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < N)
#ifdef EXTERNAL
mySum += (*pPointTransformation)(g_idata, i+blockSize);
#else
mySum += transformation(g_idata, i+blockSize);
#endif
i += gridSize; }
// --- Each thread puts its local sum into shared memory
sdata[tid] = mySum;
__syncthreads();
// --- Reduction in shared memory. Fully unrolled loop.
if ((blockSize >= 512) && (tid < 256)) sdata[tid] = mySum = mySum + sdata[tid + 256];
__syncthreads();
if ((blockSize >= 256) && (tid < 128)) sdata[tid] = mySum = mySum + sdata[tid + 128];
__syncthreads();
if ((blockSize >= 128) && (tid < 64)) sdata[tid] = mySum = mySum + sdata[tid + 64];
__syncthreads();
#if (__CUDA_ARCH__ >= 300 )
// --- Single warp reduction by shuffle operations
if ( tid < 32 )
{
// --- Last iteration removed from the for loop, but needed for shuffle reduction
mySum += sdata[tid + 32];
// --- Reduce final warp using shuffle
for (int offset = warpSize/2; offset > 0; offset /= 2) mySum += __shfl_down(mySum, offset);
//for (int offset=1; offset < warpSize; offset *= 2) mySum += __shfl_xor(mySum, i);
}
#else
// --- Reduction within a single warp. Fully unrolled loop.
if ((blockSize >= 64) && (tid < 32)) sdata[tid] = mySum = mySum + sdata[tid + 32];
__syncthreads();
if ((blockSize >= 32) && (tid < 16)) sdata[tid] = mySum = mySum + sdata[tid + 16];
__syncthreads();
if ((blockSize >= 16) && (tid < 8)) sdata[tid] = mySum = mySum + sdata[tid + 8];
__syncthreads();
if ((blockSize >= 8) && (tid < 4)) sdata[tid] = mySum = mySum + sdata[tid + 4];
__syncthreads();
if ((blockSize >= 4) && (tid < 2)) sdata[tid] = mySum = mySum + sdata[tid + 2];
__syncthreads();
if ((blockSize >= 2) && ( tid < 1)) sdata[tid] = mySum = mySum + sdata[tid + 1];
__syncthreads();
#endif
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
/******************************/
/* REDUCTION WRAPPER FUNCTION */
/******************************/
template <class T>
T transform_reduce_inner(T *g_idata, unsigned int N, pointFunction_t h_pfunc) {
// --- Reduction parameters
const int NumThreads = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
const int NumBlocks = (N + NumThreads - 1) / NumThreads;
const int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(T) : NumThreads * sizeof(T);
// --- Device memory space where storing the partial reduction results
T *g_odata; gpuErrchk(cudaMalloc((void**)&g_odata, NumBlocks * sizeof(T)));
if (isPow2(N)) {
switch (NumThreads) {
case 512: reductionKernel<T, 512, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 256: reductionKernel<T, 256, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 128: reductionKernel<T, 128, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 64: reductionKernel<T, 64, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 32: reductionKernel<T, 32, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 16: reductionKernel<T, 16, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 8: reductionKernel<T, 8, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 4: reductionKernel<T, 4, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 2: reductionKernel<T, 2, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 1: reductionKernel<T, 1, true><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
}
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
else {
switch (NumThreads) {
case 512: reductionKernel<T, 512, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 256: reductionKernel<T, 256, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 128: reductionKernel<T, 128, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 64: reductionKernel<T, 64, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 32: reductionKernel<T, 32, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 16: reductionKernel<T, 16, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 8: reductionKernel<T, 8, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 4: reductionKernel<T, 4, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 2: reductionKernel<T, 2, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
case 1: reductionKernel<T, 1, false><<< NumBlocks, NumThreads, smemSize>>>(g_idata, g_odata, N, h_pfunc); break;
}
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
T *host_vector = (T *)malloc(NumBlocks * sizeof(T));
gpuErrchk(cudaMemcpy(host_vector, g_odata, NumBlocks * sizeof(T), cudaMemcpyDeviceToHost));
T sum_transformReduce = (T)0;
for (int i=0; i<NumBlocks; i++) sum_transformReduce = sum_transformReduce + host_vector[i];
return sum_transformReduce;
}
template <class T>
T transform_reduce(T *g_idata, unsigned int N, pointFunction_t *dev_pfunc) {
#ifdef EXTERNAL
gpuErrchk(cudaMemcpyFromSymbol(&h_pfunc, *dev_pfunc, sizeof(pointFunction_t)));
#endif
T customizedDeviceResult = transform_reduce_inner(g_idata, N, h_pfunc);
return customizedDeviceResult;
}
// --- Complete with your own favourite instantiations
template float transform_reduce(float *, unsigned int, pointFunction_t *);
My answer above is mostly suited for problems with a very large number of unknowns, which are typically dealt with by local optimization algorithms. I will leave it here for possible reference to other users.
As you mentioned, you are dealing with 60-70 unknowns, a scenario which can be more easily managed by global optimization algorithms.
As underlined above, cost functionals often consist of summations, so their computation amounts to subsequent transform and reduction operations. With such a number of unknowns, reduction in shared memory could be an interesting option. Fortunately, CUB offers primitives for reduction in shared memory.
Here is a worked example on how using CUB for the calculation of a large number of cost functional values for problems having a moderate number of unknowns. The cost functional in this case is chosen to be the Rastrigin function, but the example can be adapted to other cost functionals by just changing the corresponding __device__ function.
#include <cub/cub.cuh>
#include <cuda.h>
#include "Utilities.cuh"
#include <iostream>
#define BLOCKSIZE 256
const int N = 4096;
/************************/
/* RASTRIGIN FUNCTIONAL */
/************************/
__device__ float rastrigin(float x) {
return x * x - 10.0f * cosf(2.0f * x) + 10.0f;
}
/******************************/
/* TRANSFORM REDUCTION KERNEL */
/******************************/
__global__ void CostFunctionalCalculation(const float * __restrict__ indata, float * __restrict__ outdata) {
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
// --- Specialize BlockReduce for type float.
typedef cub::BlockReduce<float, BLOCKSIZE> BlockReduceT;
__shared__ typename BlockReduceT::TempStorage temp_storage;
float result;
if(tid < N) result = BlockReduceT(temp_storage).Sum(rastrigin(indata[tid]));
if(threadIdx.x == 0) outdata[blockIdx.x] = result;
return;
}
/********/
/* MAIN */
/********/
int main() {
// --- Allocate host-side space for data and results
float *h_data = (float *)malloc(N * sizeof(float));
float *h_result = (float *)malloc((N / BLOCKSIZE) * sizeof(float));
float *d_data; gpuErrchk(cudaMalloc(&d_data, N * sizeof(float)));
float *d_result; gpuErrchk(cudaMalloc(&d_result, (N / BLOCKSIZE) * sizeof(float)));
for (int i = 0; i < N; i++) {
h_data[i] = 1.f;
}
gpuErrchk(cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice));
CostFunctionalCalculation<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_data, d_result);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_result, d_result, (N / BLOCKSIZE) * sizeof(float), cudaMemcpyDeviceToHost));
std::cout << "output: \n";
for (int k = 0; k < N / BLOCKSIZE; k++) std::cout << k << " " << h_result[k] << "\n";
std::cout << std::endl;
gpuErrchk(cudaFree(d_data));
gpuErrchk(cudaFree(d_result));
return 0;
}
I want to calculate the product A^T*A (A is a 2000x1000 matrix). I also only want to compute the upper triangular part of the result. In the inner loop I have to compute the dot product of two vectors.
Now, here is the problem: using cblas ddot() is not faster than computing the dot product with a plain loop. How is this possible? (Using an Intel Core(TM) i7 CPU M620 @ 2.67 GHz, 1.92 GB RAM.)
The problem is caused essentially by matrix size, not by ddot. Your matrices are so large that they do not fit in the cache memory. The solution is to rearrange the three nested loops such that as much as possible is done with a line in cache, thus reducing cache refreshes. A model implementation follows for both the ddot and a daxpy approach. On my computer the time consumption was about 15:1.
In other words: never, never, never program a matrix multiplication along the "row times column" scheme that we learned in school.
/*
Matrix product of A^T * A by two methods.
1) "Row times column" as we learned in school.
2) With rearranged loops such that the need for cache refreshes is reduced
(this can be improved even more).
Compile: gcc -o aT_a aT_a.c -lgslcblas -lblas -lm
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cblas.h>
#define ROWS 2000
#define COLS 1000
static double a[ROWS][COLS];
static double c[COLS][COLS];
static void dot() {
int i, j;
double *ai, *bj;
ai = a[0];
for (i=0; i<COLS; i++) {
bj = a[0];
for (j=0; j<COLS; j++) {
c[i][j] = cblas_ddot(ROWS,ai,COLS,bj,COLS);
bj += 1;
}
ai += 1;
}
}
static void axpy() {
int i, j;
double *ci, *bj, aij;
for (i=0; i<COLS; i++) {
ci = c[i];
for (j=0; j<COLS; j++) ci[j] = 0.;
for (j=0; j<ROWS; j++) {
aij = a[j][i];
bj = a[j];
cblas_daxpy(COLS,aij,bj,1,ci,1);
}
}
}
int main(int argc, char** argv) {
clock_t t0, t1;
int i, j;
for (i=0; i<ROWS; ++i)
for (j=0; j<COLS; ++j)
a[i][j] = i+j;
t0 = clock();
dot();
t1 = clock();
printf("Time for DOT : %f sec.\n",(double)(t1-t0)/CLOCKS_PER_SEC);
t0 = clock();
axpy();
t1 = clock();
printf("Time for AXPY: %f sec.\n",(double)(t1-t0)/CLOCKS_PER_SEC);
return 0;
}
The CBLAS dot product is effectively just a computation in a slightly unrolled loop. The netlib Fortran is just this:
DO I = MP1,N,5
DTEMP = DTEMP + DX(I)*DY(I) + DX(I+1)*DY(I+1) +
$ DX(I+2)*DY(I+2) + DX(I+3)*DY(I+3) + DX(I+4)*DY(I+4)
END DO
i.e. just a loop unrolled by a factor of 5.
If you must use a ddot style dot product for your operation, you might get a performance boost by re-writing your loop to use SSE2 intrinsics:
#include <emmintrin.h>
double ddotsse2(const double *x, const double *y, const int n)
{
double result[2];
int n2 = 2 * (n/2);
__m128d dtemp;
if ( (n % 2) == 0) {
dtemp = _mm_setzero_pd();
} else {
dtemp = _mm_set_sd(x[n-1] * y[n-1]); /* fold in the odd leftover element */
}
for(int i=0; i<n2; i+=2) {
__m128d x1 = _mm_loadr_pd(x+i);
__m128d y1 = _mm_loadr_pd(y+i);
__m128d xy = _mm_mul_pd(x1, y1);
dtemp = _mm_add_pd(dtemp, xy);
}
_mm_storeu_pd(&result[0],dtemp); /* unaligned store: result[] is only 8-byte aligned */
return result[0] + result[1];
}
(not tested, never been compiled, buyer beware).
This may or may not be faster than the standard BLAS implementation. You may also want to investigate whether further loop unrolling could improve performance.
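For what it's worth, one way to express that further unrolling (again my addition, untested, with the same caveats as the function above) is to keep two independent accumulators so the multiply-adds can overlap, with a scalar tail loop for leftover elements:
#include <emmintrin.h>
/* Sketch: same interface as ddotsse2 above, processing 4 doubles per iteration
   with two accumulators; unaligned loads are used so no alignment is assumed. */
double ddotsse2_unrolled(const double *x, const double *y, const int n)
{
    __m128d acc0 = _mm_setzero_pd();
    __m128d acc1 = _mm_setzero_pd();
    int i;
    for (i = 0; i + 4 <= n; i += 4) {
        acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_loadu_pd(x + i),     _mm_loadu_pd(y + i)));
        acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_loadu_pd(x + i + 2), _mm_loadu_pd(y + i + 2)));
    }
    double result[2];
    _mm_storeu_pd(result, _mm_add_pd(acc0, acc1));
    double sum = result[0] + result[1];
    for (; i < n; ++i)              /* scalar tail */
        sum += x[i] * y[i];
    return sum;
}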
If you're not using SSE2 intrinsics, or you're using a data type that doesn't benefit from them, you can transpose the matrix for an easy performance improvement in larger matrix multiplications with cblas_?dot. Performing the matrix multiplication in blocks also helps.
void matMulDotProduct(int n, float *A, float* B, int a_size, int b_size, int a_row, int a_col, int b_row, int b_col, float *C) {
int i, j, k;
MKL_INT incx, incy;
incx = 1;
incy = b_size;
//copy out multiplying matrix from larger matrix
float *temp = (float*) malloc(n * n * sizeof(float));
for (i = 0; i < n; ++i) {
cblas_scopy(n, &B[(b_row * b_size) + b_col + i], incy, &temp[i * n], 1);
}
//transpose
mkl_simatcopy('R', 'T', n, n, 1.0, temp, n, n);
for (i = 0; i < n; i+= BLOCK_SIZE) {
for (j = 0; j < n; j++) {
for (k = 0; k < BLOCK_SIZE; ++k) {
C[((i + k) * n) + j] = cblas_sdot(n, &A[(a_row + i + k) * a_size + a_col], incx, &temp[n * j], 1);
}
}
}
free(temp);
}
On my machine, this code is about one order of magnitude faster than the three-loop code (but also one order of magnitude slower than a cblas_?gemm call) for single-precision floats and 2K-by-2K matrices. (I'm using Intel MKL.)
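For reference (my addition, untested), the cblas_?gemm call being compared against would look roughly like this for the A^T*A case from the original question, in single precision and row-major storage:
#include <cblas.h>
/* C = A^T * A, where A is rows x cols in row-major order and C is cols x cols. */
void ata_gemm(int rows, int cols, const float *A, float *C)
{
    cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                cols, cols, rows,      /* M, N, K: C is cols x cols, inner dim is rows */
                1.0f, A, cols,         /* op(A) = A^T, lda = cols                      */
                      A, cols,         /* op(B) = A,   ldb = cols                      */
                0.0f, C, cols);        /* ldc = cols                                   */
}
Since the original question only needs the upper triangle of the symmetric result, cblas_ssyrk would be an even closer fit than gemm.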