i need a Sha256 kernel file , i am using Cloo as my opencl library , it will be included in WPF project
i am calculating a hash value several times
the program needs about an 30 mins or so to do that but my search result claimed opencl will reduce that time to under 3 mins or less
thanks in advance
[Edit]
ok now i managed to do it using this
https://searchcode.com/file/45893396/src/opencl/sha256_kernel.cl/
but it works fine with string
yet when sending my byteArray header to hash it returned a very different value than expected
[Edit2]
it can not handle large arrays any array more than 32 length returns missy results
Found this and i modified it to calculate double hash
if anyone needs it
#ifndef uint8_t
#define uint8_t unsigned char
#endif
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#ifndef uint64_t
#define uint64_t unsigned long int
#endif
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
typedef struct
{
uint32_t state[8];
uint64_t count;
uint8_t buffer[64];
} CSha256;
inline void Sha256_Init(CSha256 *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25))
#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch2(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define sha_a(i) T[(0-(i))&7]
#define sha_b(i) T[(1-(i))&7]
#define sha_c(i) T[(2-(i))&7]
#define sha_d(i) T[(3-(i))&7]
#define sha_e(i) T[(4-(i))&7]
#define sha_f(i) T[(5-(i))&7]
#define sha_g(i) T[(6-(i))&7]
#define sha_h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch2(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, i+1); \
R(g,h,a,b,c,d,e,f, i+2); \
R(f,g,h,a,b,c,d,e, i+3); \
R(e,f,g,h,a,b,c,d, i+4); \
R(d,e,f,g,h,a,b,c, i+5); \
R(c,d,e,f,g,h,a,b, i+6); \
R(b,c,d,e,f,g,h,a, i+7)
#else
#define R(i) sha_h(i) += S1(sha_e(i)) + Ch2(sha_e(i),sha_f(i),sha_g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
sha_d(i) += sha_h(i); sha_h(i) += S0(sha_a(i)) + Maj(sha_a(i), sha_b(i), sha_c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
inline static void Sha256_Transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16];
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
inline static void Sha256_WriteByteBlock(CSha256 *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
Sha256_Transform(p->state, data32);
}
inline void Sha256_Update(CSha256 *p, __global const uint8_t *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
Sha256_WriteByteBlock(p);
}
}
}
inline void Sha256_Final(CSha256 *p, __global uint8_t *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
Sha256_WriteByteBlock(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
lenInBits <<= 8;
}
Sha256_WriteByteBlock(p);
for (i = 0; i < 8; i++)
{
*digest++ = (uint8_t)(p->state[i] >> 24);
*digest++ = (uint8_t)(p->state[i] >> 16);
*digest++ = (uint8_t)(p->state[i] >> 8);
*digest++ = (uint8_t)(p->state[i]);
}
Sha256_Init(p);
}
inline void Sha256_Update1(CSha256 *p, const uint8_t *data, uint32_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
Sha256_WriteByteBlock(p);
}
}
}
inline void Sha256_Final1(CSha256 *p, uint8_t *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
Sha256_WriteByteBlock(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
lenInBits <<= 8;
}
Sha256_WriteByteBlock(p);
for (i = 0; i < 8; i++)
{
*digest++ = (uint8_t)(p->state[i] >> 24);
*digest++ = (uint8_t)(p->state[i] >> 16);
*digest++ = (uint8_t)(p->state[i] >> 8);
*digest++ = (uint8_t)(p->state[i]);
}
Sha256_Init(p);
}
__kernel void Sha256_1(__global uint8_t *header,__global uint8_t *toRet)
{
uint8_t tempHdr[80];
uint8_t tempDigest[32]={0};
uint startNon=toRet[0] + (toRet[1] << 8) + (toRet[2] << 16) + (toRet[3] << 24);
uint maxNon=toRet[4] + (toRet[5] << 8) + (toRet[6] << 16) + (toRet[7] << 24);
uint nonce =startNon;
uint32_t finalNon=0;
uint8_t match=0;
for(int x=0;x<80;x++)
tempHdr[x]=header[x];
tempHdr[76] = (char)(nonce);
tempHdr[77] = (char)(nonce >> 8);
tempHdr[78] = (char)(nonce >> 16);
tempHdr[79] = (char)(nonce >> 24);
while(finalNon<1)
{
CSha256 p;
Sha256_Init(&p);
Sha256_Update1(&p, tempHdr, 80);
Sha256_Final1(&p, tempDigest);
CSha256 p1;
Sha256_Init(&p1);
Sha256_Update1(&p1, tempDigest, 32);
Sha256_Final1(&p1, tempDigest);
for(int x=31;x>21;x--)
{
if(tempDigest[x]<1) match++;
}
if(match>8)
{
finalNon=nonce;
toRet[8] = (char)(nonce);
toRet[9] = (char)(nonce >> 8);
toRet[10] = (char)(nonce >> 16);
toRet[11] = (char)(nonce >> 24);
}
else
{
nonce++;
tempHdr[76] = (char)(nonce);
tempHdr[77] = (char)(nonce >> 8);
tempHdr[78] = (char)(nonce >> 16);
tempHdr[79] = (char)(nonce >> 24);
}
match=0;
if(nonce>maxNon) break;
if(nonce<=startNon) break;
}
}
I want to move on the device the whole while loop in the main. The problems emerges when I add #pragma acc host_data use_device(err) to MPI_Allreduce (&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);.
The error is that the reduction on err doesn't work so that the code exit after one step from the loop.
After the MPI_Allreduce(), even using #pragma acc update self(err), err is still equal to zero.
I'm compiling with mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c
And running with mpirun -np 2 -mca pml ^ucx ./a.out
Could you help me to find the error?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
err = err_glob;
#endif
// #pragma acc update host(err)
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}
Thanks for updating the example. There's a few issues here.
First, for "err" and "err_glob". At the beginning of the loop, you set "err=0" on the host but don't update it on the device. Then after the MPI_AllReduce call, you set "err=err_glob", again on the host, so need to update "err_glob".
The second issue is that the code is getting partially present errors for "y" when run with multiple ranks. The problem being you're using the global size not the local size for "x" and "y" so when you copy "y" it overlaps with "x" due to the offsets. I fixed this by copying "xg" and "yg" to the device instead.
As for performance relative to the CPU, the main problem here is that the size is small so the code severly under utilizes the GPU. I increased the GLOB sizes to 4096 and see better relative performance, though the code converges much faster.
I also took the liberty of adding some boiler plate code that I use for rank to device assignment so the code can take advantage of multiple GPUs.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
#ifdef _OPENACC
#include <openacc.h>
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
int xsize,ysize;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
#ifdef _OPENACC
/* -------------------------------------------------------
0. Set the device for each rank
------------------------------------------------------- */
int device_type, num_devices;
int gpuId;
MPI_Comm shmcomm;
int local_rank;
// Get the local rank number
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_Comm_rank(shmcomm, &local_rank);
// Device num = local rank mod number of devices on the node
device_type = acc_get_device_type();
num_devices = acc_get_num_devices(device_type);
gpuId = local_rank % num_devices;
acc_set_device_num(gpuId, device_type);
acc_init(device_type);
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
//#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc update device(err)
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
#pragma acc update host(err_glob)
err = err_glob;
#endif
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}