OpenACC and CUDA aware MPI

OpenACC and CUDA aware MPI - gpu

I want to move on the device the whole while loop in the main. The problems emerges when I add #pragma acc host_data use_device(err) to MPI_Allreduce (&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);.
The error is that the reduction on err doesn't work so that the code exit after one step from the loop.
After the MPI_Allreduce(), even using #pragma acc update self(err), err is still equal to zero.
I'm compiling with mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c
And running with mpirun -np 2 -mca pml ^ucx ./a.out
Could you help me to find the error?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
err = err_glob;
#endif
// #pragma acc update host(err)
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}

Thanks for updating the example. There's a few issues here.
First, for "err" and "err_glob". At the beginning of the loop, you set "err=0" on the host but don't update it on the device. Then after the MPI_AllReduce call, you set "err=err_glob", again on the host, so need to update "err_glob".
The second issue is that the code is getting partially present errors for "y" when run with multiple ranks. The problem being you're using the global size not the local size for "x" and "y" so when you copy "y" it overlaps with "x" due to the offsets. I fixed this by copying "xg" and "yg" to the device instead.
As for performance relative to the CPU, the main problem here is that the size is small so the code severly under utilizes the GPU. I increased the GLOB sizes to 4096 and see better relative performance, though the code converges much faster.
I also took the liberty of adding some boiler plate code that I use for rank to device assignment so the code can take advantage of multiple GPUs.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
#ifdef _OPENACC
#include <openacc.h>
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
int xsize,ysize;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
#ifdef _OPENACC
/* -------------------------------------------------------
0. Set the device for each rank
------------------------------------------------------- */
int device_type, num_devices;
int gpuId;
MPI_Comm shmcomm;
int local_rank;
// Get the local rank number
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_Comm_rank(shmcomm, &local_rank);
// Device num = local rank mod number of devices on the node
device_type = acc_get_device_type();
num_devices = acc_get_num_devices(device_type);
gpuId = local_rank % num_devices;
acc_set_device_num(gpuId, device_type);
acc_init(device_type);
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
//#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc update device(err)
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
#pragma acc update host(err_glob)
err = err_glob;
#endif
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}

Related

Sha256 opencl kernel needed [Help needed]

i need a Sha256 kernel file , i am using Cloo as my opencl library , it will be included in WPF project
i am calculating a hash value several times
the program needs about an 30 mins or so to do that but my search result claimed opencl will reduce that time to under 3 mins or less
thanks in advance
[Edit]
ok now i managed to do it using this
https://searchcode.com/file/45893396/src/opencl/sha256_kernel.cl/
but it works fine with string
yet when sending my byteArray header to hash it returned a very different value than expected
[Edit2]
it can not handle large arrays any array more than 32 length returns missy results

Found this and i modified it to calculate double hash
if anyone needs it
#ifndef uint8_t
#define uint8_t unsigned char
#endif
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#ifndef uint64_t
#define uint64_t unsigned long int
#endif
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
typedef struct
{
uint32_t state[8];
uint64_t count;
uint8_t buffer[64];
} CSha256;
inline void Sha256_Init(CSha256 *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25))
#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch2(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define sha_a(i) T[(0-(i))&7]
#define sha_b(i) T[(1-(i))&7]
#define sha_c(i) T[(2-(i))&7]
#define sha_d(i) T[(3-(i))&7]
#define sha_e(i) T[(4-(i))&7]
#define sha_f(i) T[(5-(i))&7]
#define sha_g(i) T[(6-(i))&7]
#define sha_h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch2(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, i+1); \
R(g,h,a,b,c,d,e,f, i+2); \
R(f,g,h,a,b,c,d,e, i+3); \
R(e,f,g,h,a,b,c,d, i+4); \
R(d,e,f,g,h,a,b,c, i+5); \
R(c,d,e,f,g,h,a,b, i+6); \
R(b,c,d,e,f,g,h,a, i+7)
#else
#define R(i) sha_h(i) += S1(sha_e(i)) + Ch2(sha_e(i),sha_f(i),sha_g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
sha_d(i) += sha_h(i); sha_h(i) += S0(sha_a(i)) + Maj(sha_a(i), sha_b(i), sha_c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
inline static void Sha256_Transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16];
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
inline static void Sha256_WriteByteBlock(CSha256 *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
Sha256_Transform(p->state, data32);
}
inline void Sha256_Update(CSha256 *p, __global const uint8_t *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
Sha256_WriteByteBlock(p);
}
}
}
inline void Sha256_Final(CSha256 *p, __global uint8_t *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
Sha256_WriteByteBlock(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
lenInBits <<= 8;
}
Sha256_WriteByteBlock(p);
for (i = 0; i < 8; i++)
{
*digest++ = (uint8_t)(p->state[i] >> 24);
*digest++ = (uint8_t)(p->state[i] >> 16);
*digest++ = (uint8_t)(p->state[i] >> 8);
*digest++ = (uint8_t)(p->state[i]);
}
Sha256_Init(p);
}
inline void Sha256_Update1(CSha256 *p, const uint8_t *data, uint32_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
Sha256_WriteByteBlock(p);
}
}
}
inline void Sha256_Final1(CSha256 *p, uint8_t *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
Sha256_WriteByteBlock(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
lenInBits <<= 8;
}
Sha256_WriteByteBlock(p);
for (i = 0; i < 8; i++)
{
*digest++ = (uint8_t)(p->state[i] >> 24);
*digest++ = (uint8_t)(p->state[i] >> 16);
*digest++ = (uint8_t)(p->state[i] >> 8);
*digest++ = (uint8_t)(p->state[i]);
}
Sha256_Init(p);
}
__kernel void Sha256_1(__global uint8_t *header,__global uint8_t *toRet)
{
uint8_t tempHdr[80];
uint8_t tempDigest[32]={0};
uint startNon=toRet[0] + (toRet[1] << 8) + (toRet[2] << 16) + (toRet[3] << 24);
uint maxNon=toRet[4] + (toRet[5] << 8) + (toRet[6] << 16) + (toRet[7] << 24);
uint nonce =startNon;
uint32_t finalNon=0;
uint8_t match=0;
for(int x=0;x<80;x++)
tempHdr[x]=header[x];
tempHdr[76] = (char)(nonce);
tempHdr[77] = (char)(nonce >> 8);
tempHdr[78] = (char)(nonce >> 16);
tempHdr[79] = (char)(nonce >> 24);
while(finalNon<1)
{
CSha256 p;
Sha256_Init(&p);
Sha256_Update1(&p, tempHdr, 80);
Sha256_Final1(&p, tempDigest);
CSha256 p1;
Sha256_Init(&p1);
Sha256_Update1(&p1, tempDigest, 32);
Sha256_Final1(&p1, tempDigest);
for(int x=31;x>21;x--)
{
if(tempDigest[x]<1) match++;
}
if(match>8)
{
finalNon=nonce;
toRet[8] = (char)(nonce);
toRet[9] = (char)(nonce >> 8);
toRet[10] = (char)(nonce >> 16);
toRet[11] = (char)(nonce >> 24);
}
else
{
nonce++;
tempHdr[76] = (char)(nonce);
tempHdr[77] = (char)(nonce >> 8);
tempHdr[78] = (char)(nonce >> 16);
tempHdr[79] = (char)(nonce >> 24);
}
match=0;
if(nonce>maxNon) break;
if(nonce<=startNon) break;
}
}

How to not parallelize inner loops in OpenACC

I am a beginner in doing GPU programming with OpenACC. I was trying to do a direct convolution. Convolution consists of 6 nested loops. I only want the first loop to be parallelized. I gave the pragma #pragma acc loop for the first loop and #pragma acc loop seq for the rest. But the output that I am getting is not correct. Is the approach taken by me to parallelize the loop correct ? Specifications for the convolution: Input channels-3, Input Size- 224X224X3, Output channels- 64, Output Size- 111X111X64, filter size- 3X3X3X64. Following is the link to the header files dog.h and squeezenet_params.h. https://drive.google.com/drive/folders/1a9XRjBTrEFIorrLTPFHS4atBOPrG886i
# include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "squeezenet_params.h"
#include "dog.h"
void conv3x3(
const int input_channels, const int input_size,
const int pad, const int stride, const int start_channel,
const int output_size, const float* restrict input_im, const float* restrict filter_weight,
const float* restrict filter_bias, float* restrict output_im){
#pragma acc data copyin (input_im[0:150527],filter_weight[0:1727],filter_bias[0:63]) copyout(output_im[0:788543])
{
#pragma acc parallel
{
#pragma acc loop
for(int p=0;p<64;++p){
filter_weight += p * input_channels * 9;
float bias = filter_bias[p];
output_im += (start_channel + p) * output_size * output_size;
//loop over output feature map
#pragma acc loop seq
for(int i = 0; i < output_size; i++)
{
#pragma acc loop seq
for(int j = 0; j < output_size; j++)
{
//compute one element in the output feature map
float tmp = bias;
//compute dot product of 2 input_channels x 3 x 3 matrix
#pragma acc loop seq
for(int k = 0; k < input_channels; k++)
{
#pragma acc loop seq
for(int l = 0; l < 3; l++)
{
int h = i * stride + l - pad;
#pragma acc loop seq
for(int m = 0; m < 3; m++)
{
int w = j * stride + m - pad;
if((h >= 0) && (h < input_size) && (w >= 0) && (w < input_size))
{
tmp += input_im[k * input_size * input_size + (i * stride + l - pad) * input_size + j * stride + m - pad] \
* filter_weight[9 * k + 3 * l + m];
}
}
}
}
//add relu activation after conv
output_im[i * output_size + j] = (tmp > 0.0) ? tmp : 0.0;
}
}
}
}
}
}
void main(){
float * result = (float*)malloc(sizeof(float) * (1 * 64 * 111 * 111));
conv3x3(3,224,0,2,0,111,sample,conv1_weight,conv1_bias,result);
for(int i=0;i<64 * 111 * 111;++i){
//if(result[i]>0)
printf("%f:%d\n",result[i],i);
}
}

The contributor posted the same question on the PGI User Forums where I've answered. (See: https://www.pgroup.com/userforum/viewtopic.php?f=4&t=7614). The topic question is incorrect in that the inner loops are not getting parallelized nor are the cause of the issue.
The problem here is that the code has a race condition on the shared "output_im" pointer. My suggested solution is to compute a per thread offset into the array rather than trying to manipulate the pointer itself.
for(int p=0;p<64;++p){
filter_weight += p * input_channels * 9;
float bias = filter_bias[p];
int offset;
offset = (start_channel + p) * output_size * output_size;
//loop over output feature map
#pragma acc loop vector collapse(2)
for(int i = 0; i < output_size; i++)
{
for(int j = 0; j < output_size; j++)
{
... cut ...
}
}
//add relu activation after conv
int idx = offset + (i * output_size + j);
output_im[idx] = (tmp > 0.0) ? tmp : 0.0;
}
}

Matrices multiply, Cannon algorithm implementation using MPI

First of all, of course I saw similar questions and solutions, but my implementation is a little bit different.
The main problem is that, the my code works only for one process, but it doesn't work for more processes.
I don't know what is the cause of this... Probably in communication between processes but I can't figure it out ;/
#include <mpi.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
using namespace std;
int main(int argc, char **argv)
{
int x = 0;
double kk;
int proces;
int numprocs;
int right_neigh, left_neigh, up_neigh, down_neigh;
int tag = 99;
static const int n = 6; //size of matrices
int psa[n][n]; //nxn
int psb[n][n];
int pra[n][n];
int prb[n][n];
int c[n][n];
for (int i = 0; i < n; i++) { //let's make fist matrix
for (int j = 0; j < n; j++) {
psa[i][j] = (int)rand() % 100 + 1;
psb[i][j] = (int)rand() % 100 + 1;
c[i][j] = 0;
}
}
for (int i = 0; i < n; i++) { //an the 2nd one
for (int j = 0; j < n; j++) {
pra[i][j] = psa[i][j];
prb[i][j] = psb[i][j];
}
}
MPI_Status statRecv[2];
MPI_Request reqSend[2], reqRecv[2];
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &proces);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
int PP = numprocs;
double np = numprocs;
kk = sqrt(np);
int k = (int)kk;
if (proces < k) // below neighbour set
{
left_neigh = (proces + k - 1) % k;
right_neigh = (proces + k + 1) % k;
up_neigh = ((k - 1)*k) + proces;
}
if (proces == k)
{
left_neigh = ((proces + k - 1) % k) + k;
right_neigh = ((proces + k + 1) % k) + k;
up_neigh = proces - k;
}
if (proces > k)
{
x = proces / k;
left_neigh = ((proces + k - 1) % k) + x * k;
right_neigh = ((proces + k + 1) % k) + x * k;
up_neigh = proces - k;
}
if (proces == 0 || (proces / k) < (k - 1))
{
down_neigh = proces + k;
}
if ((proces / k) == (k - 1))
{
down_neigh = proces - ((k - 1)*k);
}
x = 0;
for(int kk = 0; kk < PP; kk++) //algorithm
{
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
for (int k = 0; k < n / PP; k++)
{
c[i][j] += psa[i][k] * psb[k][j];
}
}
}
MPI_Irecv(pra, n*n / PP / PP,MPI_FLOAT,left_neigh, tag,MPI_COMM_WORLD, reqRecv);
MPI_Irecv(prb, n*n / PP / PP,MPI_FLOAT,down_neigh,tag,MPI_COMM_WORLD,&reqRecv[1]);
MPI_Isend(psa, n*n / PP / PP,MPI_FLOAT,right_neigh,tag,MPI_COMM_WORLD, reqSend);
MPI_Isend(psb, n*n / PP / PP,MPI_FLOAT,up_neigh,tag,MPI_COMM_WORLD,&reqSend[1]);
MPI_Wait(reqRecv, statRecv);
}
cout << "A" << endl; //show result
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
cout << pra[i][j] << " ";
}
cout << endl;
}
cout << "B" << endl;
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
cout << prb[i][j] << " ";
}
cout << endl;
}
cout << "C" << endl;
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
cout << c[i][j] << " ";
}
cout << endl;
}
MPI_Finalize();
return 0;
}

Ok I made it. Now everything is cool, my friend helped me out. But Admin please do not remove it, it can be helpful for someone.
#include <mpi.h>
#include <stdio.h>
#include <math.h>
#include <iostream> using namespace std; int main(int argc, char **argv) { int x = 0; double kk; int proces; int numprocs; int prawy_sasiad, lewy_sasiad, gorny_sasiad, dolny_sasiad; int tag = 99;
static const int n = 4; //rozmiar tablic
const int PP = 2; // pierwiastek z liczby procesow
int A[n][n] = {}, B[n][n] = {};
for (int i = 0; i < n; i++) {//inicjalizacja macierzy glownych
for (int j = 0; j < n; j++) {
A[i][j] = (int)rand() % 100 + 1;
B[i][j] = (int)rand() % 100 + 1;
}
}
/*
int val = 1;
for (int i = 0; i < n; i++) { //inicjalizacja macierzy glownych
for (int j = 0; j < n; j++) {
A[i][j] = val;
B[i][j] = val;
val++;
}
}
*/
MPI_Status statRecv2;
MPI_Request reqSend2, reqRecv2;
MPI_Status statRecv[2];
MPI_Request reqSend[2], reqRecv[2];
MPI_Init(0, 0);
MPI_Comm_rank(MPI_COMM_WORLD, &proces);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
int pra[n / PP][n / PP] = {}, psa[n / PP][n / PP] = {};// podmacierze
int prb[n / PP][n / PP] = {}, psb[n / PP][n / PP] = {};
//int C[n / PP][n / PP] = {};//wynikowa
int C[n][n] = {};//wynikowa
//cout << proces << endl;
for (int i = 0; i < n / PP; i++)//podzielenie macierzy glownej na podmacierze, kazdy proces otrzymuje inna podmacierz
{
for (int j = 0; j < n / PP; j++)
{
psa[i][j] = A[proces / PP*(n / PP) + i][proces%PP*(n / PP) + j];
psb[i][j] = B[proces / PP*(n / PP) + i][proces%PP*(n / PP) + j];
//cout << A[proces / PP*(n / PP) + i][proces%PP*(n / PP) + j] << " ";
}
//cout << endl;
}
double np = numprocs;
kk = sqrt(np);
int k = (int)kk;
if (proces < k) // ustawienie sasiadow
{
lewy_sasiad = (proces + k - 1) % k;
prawy_sasiad = (proces + k + 1) % k;
gorny_sasiad = ((k - 1)*k) + proces;
}
if (proces == k)
{
lewy_sasiad = ((proces + k - 1) % k) + k;
prawy_sasiad = ((proces + k + 1) % k) + k;
gorny_sasiad = proces - k;
}
if (proces > k)
{
x = proces / k;
lewy_sasiad = ((proces + k - 1) % k) + x * k;
prawy_sasiad = ((proces + k + 1) % k) + x * k;
gorny_sasiad = proces - k;
}
if (proces == 0 || (proces / k) < (k - 1))
{
dolny_sasiad = proces + k;
}
if ((proces / k) == (k - 1))
{
dolny_sasiad = proces - ((k - 1)*k);
}
x = 0;
int p = 0;
do{ //przesuniecia
if (p < proces / PP)// w wierszu
{
MPI_Irecv(pra, n*n / PP / PP, MPI_FLOAT, prawy_sasiad, tag, MPI_COMM_WORLD, &reqRecv2);
MPI_Isend(psa, n*n / PP / PP, MPI_FLOAT, lewy_sasiad, tag, MPI_COMM_WORLD, &reqSend2);
MPI_Wait(&reqRecv2, &statRecv2);
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psa[i][j] = pra[i][j];
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
if (p < proces % PP)// i w kolumnie
{
MPI_Irecv(prb, n*n / PP / PP, MPI_FLOAT, dolny_sasiad, tag, MPI_COMM_WORLD, &reqRecv2);
MPI_Isend(psb, n*n / PP / PP, MPI_FLOAT, gorny_sasiad, tag, MPI_COMM_WORLD, &reqSend2);
MPI_Wait(&reqRecv2, &statRecv2);
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psb[i][j] = prb[i][j];
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
p++;
} while (p < n);
//MPI_Barrier(MPI_COMM_WORLD);
for (int kkk = 0; kkk < PP; kkk++) //algorytm
{
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
for (int k = 0; k < n / PP; k++)
{
C[i][j] += psa[i][k] * psb[k][j];
}
}
}
MPI_Irecv(pra, n*n / PP / PP, MPI_FLOAT, prawy_sasiad, tag, MPI_COMM_WORLD, reqRecv);
MPI_Irecv(prb, n*n / PP / PP, MPI_FLOAT, dolny_sasiad, tag, MPI_COMM_WORLD, &reqRecv[1]);
MPI_Isend(psa, n*n / PP / PP, MPI_FLOAT, lewy_sasiad, tag, MPI_COMM_WORLD, reqSend);
MPI_Isend(psb, n*n / PP / PP, MPI_FLOAT, gorny_sasiad, tag, MPI_COMM_WORLD, &reqSend[1]);
MPI_Wait(reqRecv, statRecv);
MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psa[i][j] = pra[i][j];
}
}
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psb[i][j] = prb[i][j];
}
}
}
cout << "Proces: " << proces << " ";
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
cout << C[i][j] << " ";
}
}
MPI_Finalize();
return 0;
}

OpenCL kernel doesn't finish executing

I am writing a simple monte carlo code for simulation of electron scattering. I ran the Kernel for 10 million electron and it runs fine, but when I increase the number of electrons to a higher number, say 50 million, the code just wouldn't finish and the computer freezes. I wanted to know if this is a hardware issue or if there is a possible bug in the code. I am running the code on a iMac with ATI Radeon HD 5870.
int rand_r (unsigned int seed)
{
unsigned int next = seed;
int result;
next *= 1103515245;
next += 12345;
result = (unsigned int) (next / 65536) % 2048;
next *= 1103515245;
next += 12345;
result <<= 10;
result ^= (unsigned int) (next / 65536) % 1024;
next *= 1103515245;
next += 12345;
result <<= 10;
result ^= (unsigned int) (next / 65536) % 1024;
seed = next;
return result;
}
__kernel void MC(const float E, __global float* bse, const int count) {
int tx, ty;
tx = get_global_id(0);
ty = get_global_id(1);
float RAND_MAX = 2147483647.0f;
int rand_seed;
int seed = count*ty + tx;
float rand;
float PI;
PI = 3.14159f;
float z;
z = 28.0f;
float rho;
rho = 8.908f;
float A;
A = 58.69f;
int num;
num = 10000000/(count*count);
int counter, counter1, counter2;
counter = 0;
float4 c_new, r_new;
float E_new, alpha, de_ds, phi, psi, mfp,sig_eNA,step, dsq, dsqi, absc0z;
float J;
J = (9.76f*z + 58.5f*powr(z,-0.19f))*1E-3f;
float4 r0 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
float2 tilt = (float2)((70.0f/180.0f)*PI , 0.0f);
float4 c0 = (float4)(cos(tilt.y)*sin(tilt.x), sin(tilt.y)*sin(tilt.x), cos(tilt.x), 0.0f);
for (int i = 0; i < num; ++i){
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //some random no. generator in gpu
r0 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
c0 = (float4)(cos(tilt.y)*sin(tilt.x), sin(tilt.y)*sin(tilt.x), cos(tilt.x), 0.0f);
E_new = E;
c_new = c0;
alpha = (3.4E-3f)*powr(z,0.67f)/E_new;
sig_eNA = (5.21f * 602.3f)*((z*z)/(E_new*E_new))*((4.0f*PI)/(alpha*(1+alpha)))*((E_new + 511.0f)*(E_new + 511.0f)/((E_new + 1024.0f)*(E_new + 1024.0f)));
mfp = A/(rho*sig_eNA);
step = -mfp * log(rand);
r_new = (float4)(r0.x + step*c_new.x, r0.y + step*c_new.y, r0.z + step*c_new.z, 0.0f);
r0 = r_new;
counter1 = 0;
counter2 = 0;
while (counter1 < 1000){
alpha = (3.4E-3f)*powr(z,0.67f)/E_new;
sig_eNA = (5.21f * 602.3f)*((z*z)/(E_new*E_new))*((4*PI)/(alpha*(1+alpha)))*((E_new + 511.0f)*(E_new + 511.0f)/((E_new + 1024.0f)*(E_new + 1024.0f)));
mfp = A/(rho*sig_eNA);
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //some random no. generator in gpu
step = -mfp * log(rand);
de_ds = -78500.0f*(z/(A*E_new)) * log((1.66f*(E_new + 0.85f*J))/J);
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //new random no.
phi = acos(1 - ((2*alpha*rand)/(1 + alpha - rand)));
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //third random no.
psi = 2*PI*rand;
if ((c0.z >= 0.999f) || (c0.z <= -0.999f) ){
absc0z = abs(c0.z);
c_new = (float4)(sin(phi) * cos(psi), sin(phi) * sin(psi), (c0.z/absc0z)*cos(phi), 0.0f);
}
else {
dsq = sqrt(1-c0.z*c0.z);
dsqi = 1/dsq;
c_new = (float4)(sin(phi)*(c0.x*c0.z*cos(psi) - c0.y*sin(psi))*dsqi + c0.x*cos(phi), sin(phi) * (c0.y * c0.z * cos(psi) + c0.x * sin(psi)) * dsqi + c0.y * cos(phi), -sin(phi) * cos(psi) * dsq + c0.z * cos(phi), 0.0f);
}
r_new = (float4)(r0.x + step*c_new.x, r0.y + step*c_new.y, r0.z + step*c_new.z, 0.0f);
r0 = r_new;
c0 = c_new;
E_new += step*rho*de_ds;
if (r0.z <= 0 && counter2 == 0){
counter++ ;
counter2 = 1;
}
counter1++ ;
}
}
bse[count*ty + tx] = counter;
}

dot product using cblas is slow

I want to calculate the product A^T*A ( A is 2000x1000 Matrix). Also i only want to solve the upper triangular Matrix. In the inner loop i have to solve the dot product of two vectors.
Now, here is the problem. Using cblas ddot() is not faster than calculating the dot product with a loop. How is this possible? (using Intel Core (TM)i7 CPU M620 #2,67GHz, 1,92GB RAM)

The problem is caused essentially by matrix size, not by ddot. Your matrices are so large that they do not fit in the cache memory. The solution is to rearrange the three nested loops such that as much as possible can be done with a line in cache, so reducing cache refreshes. A model implementation follows for both the ddot and an daxpy approach. On my computer the time consumption was about 15:1.
In other words: never, never, never program a matrix multiplication along the "row times column" scheme that we learned in school.
/*
Matrix product of A^T * A by two methods.
1) "Row times column" as we learned in school.
2) With rearranged loops such that need for cash refreshes is reduced
(this can be improved even more).
Compile: gcc -o aT_a aT_a.c -lgslcblas -lblas -lm
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cblas.h>
#define ROWS 2000
#define COLS 1000
static double a[ROWS][COLS];
static double c[COLS][COLS];
static void dot() {
int i, j;
double *ai, *bj;
ai = a[0];
for (i=0; i<COLS; i++) {
bj = a[0];
for (j=0; j<COLS; j++) {
c[i][j] = cblas_ddot(ROWS,ai,COLS,bj,COLS);
bj += 1;
}
ai += 1;
}
}
static void axpy() {
int i, j;
double *ci, *bj, aij;
for (i=0; i<COLS; i++) {
ci = c[i];
for (j=0; j<COLS; j++) ci[j] = 0.;
for (j=0; j<ROWS; j++) {
aij = a[j][i];
bj = a[j];
cblas_daxpy(COLS,aij,bj,1,ci,1);
}
}
}
int main(int argc, char** argv) {
clock_t t0, t1;
int i, j;
for (i=0; i<ROWS; ++i)
for (j=0; j<COLS; ++j)
a[i][j] = i+j;
t0 = clock();
dot();
t0 = clock();
printf("Time for DOT : %f sec.\n",(double)t0/CLOCKS_PER_SEC);
axpy();
t1 = clock();
printf("Time for AXPY: %f sec.\n",(double)(t1-t0)/CLOCKS_PER_SEC);
return 0;
}

The CBLAS dot product is effectively just a computation in slightly unrolled loop. The netlib Fortran is just this:
DO I = MP1,N,5
DTEMP = DTEMP + DX(I)*DY(I) + DX(I+1)*DY(I+1) +
$ DX(I+2)*DY(I+2) + DX(I+3)*DY(I+3) + DX(I+4)*DY(I+4)
END DO
ie. just a loop unrolled to a stride of 5.
If you must use a ddot style dot product for your operation, you might get a performance boost by re-writing your loop to use SSE2 intrinsics:
#include <emmintrin.h>
double ddotsse2(const double *x, const double *y, const int n)
{
double result[2];
int n2 = 2 * (n/2);
__m128d dtemp;
if ( (n % 2) == 0) {
dtemp = _mm_setzero_pd();
} else {
dtemp = _mm_set_sd(x[n] * y[n]);
}
for(int i=0; i<n2; i+=2) {
__m128d x1 = _mm_loadr_pd(x+i);
__m128d y1 = _mm_loadr_pd(y+i);
__m128d xy = _mm_mul_pd(x1, y1);
dtemp = _mm_add_pd(dtemp, xy);
}
_mm_store_pd(&result[0],dtemp);
return result[0] + result[1];
}
(not tested, never been compiled, buyer beware).
This may or may be faster than the standard BLAS implementation. You may also want to investigate whether further loop unrolling could improve performance.

If you're not using SSE2 intrinsics or using a data type that may not boost performance with them, you can try to transpose the matrix for an easy improvement in performance for larger matrix multiplications with cblas_?dot. Performing the matrix multiplication in blocks also helps.
void matMulDotProduct(int n, float *A, float* B, int a_size, int b_size, int a_row, int a_col, int b_row, int b_col, float *C) {
int i, j, k;
MKL_INT incx, incy;
incx = 1;
incy = b_size;
//copy out multiplying matrix from larger matrix
float *temp = (float*) malloc(n * n * sizeof(float));
for (i = 0; i < n; ++i) {
cblas_scopy(n, &B[(b_row * b_size) + b_col + i], incy, &temp[i * n], 1);
}
//transpose
mkl_simatcopy('R', 'T', n, n, 1.0, temp, 1, 1);
for (i = 0; i < n; i+= BLOCK_SIZE) {
for (j = 0; j < n; j++) {
for (k = 0; k < BLOCK_SIZE; ++k) {
C[((i + k) * n) + j] = cblas_sdot(n, &A[(a_row + i + k) * a_size + a_col], incx, &temp[n * j], 1);
}
}
}
free(temp);
}
On my machine, this code is about 1 order of magnitude faster than the the 3 loop code (but also 1 order of magnitude slower than cblas_?gemm call) for single precision floats and 2K by 2K matrices. (I'm using Intel MKL).

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

OpenACC and CUDA aware MPI - gpu

Related

Sha256 opencl kernel needed [Help needed]

How to not parallelize inner loops in OpenACC

Matrices multiply, Cannon algorithm implementation using MPI

OpenCL kernel doesn't finish executing

dot product using cblas is slow

Categories

Resources