Matrix multiplication: Cannon's algorithm implementation with MPI processes

First of all, I have of course seen similar questions and solutions, but my implementation is a little bit different.
The main problem is that my code works with one process, but it doesn't work with more processes.
I don't know what the cause of this is; it is probably in the communication between processes, but I can't figure it out.
#include <mpi.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
using namespace std;
int main(int argc, char **argv)
{
int x = 0;
double kk;
int proces;
int numprocs;
int right_neigh, left_neigh, up_neigh, down_neigh;
int tag = 99;
static const int n = 6; //size of matrices
int psa[n][n]; //nxn
int psb[n][n];
int pra[n][n];
int prb[n][n];
int c[n][n];
for (int i = 0; i < n; i++) { //let's make the first matrix
for (int j = 0; j < n; j++) {
psa[i][j] = (int)rand() % 100 + 1;
psb[i][j] = (int)rand() % 100 + 1;
c[i][j] = 0;
}
}
for (int i = 0; i < n; i++) { //and the 2nd one
for (int j = 0; j < n; j++) {
pra[i][j] = psa[i][j];
prb[i][j] = psb[i][j];
}
}
MPI_Status statRecv[2];
MPI_Request reqSend[2], reqRecv[2];
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &proces);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
int PP = numprocs;
double np = numprocs;
kk = sqrt(np);
int k = (int)kk;
if (proces < k) // below neighbour set
{
left_neigh = (proces + k - 1) % k;
right_neigh = (proces + k + 1) % k;
up_neigh = ((k - 1)*k) + proces;
}
if (proces == k)
{
left_neigh = ((proces + k - 1) % k) + k;
right_neigh = ((proces + k + 1) % k) + k;
up_neigh = proces - k;
}
if (proces > k)
{
x = proces / k;
left_neigh = ((proces + k - 1) % k) + x * k;
right_neigh = ((proces + k + 1) % k) + x * k;
up_neigh = proces - k;
}
if (proces == 0 || (proces / k) < (k - 1))
{
down_neigh = proces + k;
}
if ((proces / k) == (k - 1))
{
down_neigh = proces - ((k - 1)*k);
}
x = 0;
for(int kk = 0; kk < PP; kk++) //algorithm
{
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
for (int k = 0; k < n / PP; k++)
{
c[i][j] += psa[i][k] * psb[k][j];
}
}
}
MPI_Irecv(pra, n*n / PP / PP,MPI_FLOAT,left_neigh, tag,MPI_COMM_WORLD, reqRecv);
MPI_Irecv(prb, n*n / PP / PP,MPI_FLOAT,down_neigh,tag,MPI_COMM_WORLD,&reqRecv[1]);
MPI_Isend(psa, n*n / PP / PP,MPI_FLOAT,right_neigh,tag,MPI_COMM_WORLD, reqSend);
MPI_Isend(psb, n*n / PP / PP,MPI_FLOAT,up_neigh,tag,MPI_COMM_WORLD,&reqSend[1]);
MPI_Wait(reqRecv, statRecv);
}
cout << "A" << endl; //show result
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
cout << pra[i][j] << " ";
}
cout << endl;
}
cout << "B" << endl;
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
cout << prb[i][j] << " ";
}
cout << endl;
}
cout << "C" << endl;
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
cout << c[i][j] << " ";
}
cout << endl;
}
MPI_Finalize();
return 0;
}

OK, I made it work; a friend helped me out. Admins, please do not remove this, as it can be helpful for someone.
#include <mpi.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
using namespace std;
int main(int argc, char **argv)
{
int x = 0;
double kk;
int proces;
int numprocs;
int prawy_sasiad, lewy_sasiad, gorny_sasiad, dolny_sasiad;
int tag = 99;
static const int n = 4; //size of the matrices
const int PP = 2; // square root of the number of processes
int A[n][n] = {}, B[n][n] = {};
for (int i = 0; i < n; i++) {//initialize the main matrices
for (int j = 0; j < n; j++) {
A[i][j] = (int)rand() % 100 + 1;
B[i][j] = (int)rand() % 100 + 1;
}
}
/*
int val = 1;
for (int i = 0; i < n; i++) { //initialize the main matrices
for (int j = 0; j < n; j++) {
A[i][j] = val;
B[i][j] = val;
val++;
}
}
*/
MPI_Status statRecv2;
MPI_Request reqSend2, reqRecv2;
MPI_Status statRecv[2];
MPI_Request reqSend[2], reqRecv[2];
MPI_Init(0, 0);
MPI_Comm_rank(MPI_COMM_WORLD, &proces);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
int pra[n / PP][n / PP] = {}, psa[n / PP][n / PP] = {};// submatrices
int prb[n / PP][n / PP] = {}, psb[n / PP][n / PP] = {};
//int C[n / PP][n / PP] = {};//result matrix
int C[n][n] = {};//result matrix
//cout << proces << endl;
for (int i = 0; i < n / PP; i++)//split the main matrices into submatrices; each process gets a different submatrix
{
for (int j = 0; j < n / PP; j++)
{
psa[i][j] = A[proces / PP*(n / PP) + i][proces%PP*(n / PP) + j];
psb[i][j] = B[proces / PP*(n / PP) + i][proces%PP*(n / PP) + j];
//cout << A[proces / PP*(n / PP) + i][proces%PP*(n / PP) + j] << " ";
}
//cout << endl;
}
double np = numprocs;
kk = sqrt(np);
int k = (int)kk;
if (proces < k) // set up the neighbours
{
lewy_sasiad = (proces + k - 1) % k;
prawy_sasiad = (proces + k + 1) % k;
gorny_sasiad = ((k - 1)*k) + proces;
}
if (proces == k)
{
lewy_sasiad = ((proces + k - 1) % k) + k;
prawy_sasiad = ((proces + k + 1) % k) + k;
gorny_sasiad = proces - k;
}
if (proces > k)
{
x = proces / k;
lewy_sasiad = ((proces + k - 1) % k) + x * k;
prawy_sasiad = ((proces + k + 1) % k) + x * k;
gorny_sasiad = proces - k;
}
if (proces == 0 || (proces / k) < (k - 1))
{
dolny_sasiad = proces + k;
}
if ((proces / k) == (k - 1))
{
dolny_sasiad = proces - ((k - 1)*k);
}
x = 0;
int p = 0;
do{ //initial shifts
if (p < proces / PP)// in the row
{
MPI_Irecv(pra, n*n / PP / PP, MPI_FLOAT, prawy_sasiad, tag, MPI_COMM_WORLD, &reqRecv2);
MPI_Isend(psa, n*n / PP / PP, MPI_FLOAT, lewy_sasiad, tag, MPI_COMM_WORLD, &reqSend2);
MPI_Wait(&reqRecv2, &statRecv2);
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psa[i][j] = pra[i][j];
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
if (p < proces % PP)// and in the column
{
MPI_Irecv(prb, n*n / PP / PP, MPI_FLOAT, dolny_sasiad, tag, MPI_COMM_WORLD, &reqRecv2);
MPI_Isend(psb, n*n / PP / PP, MPI_FLOAT, gorny_sasiad, tag, MPI_COMM_WORLD, &reqSend2);
MPI_Wait(&reqRecv2, &statRecv2);
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psb[i][j] = prb[i][j];
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
p++;
} while (p < n);
//MPI_Barrier(MPI_COMM_WORLD);
for (int kkk = 0; kkk < PP; kkk++) //algorithm
{
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
for (int k = 0; k < n / PP; k++)
{
C[i][j] += psa[i][k] * psb[k][j];
}
}
}
MPI_Irecv(pra, n*n / PP / PP, MPI_FLOAT, prawy_sasiad, tag, MPI_COMM_WORLD, reqRecv);
MPI_Irecv(prb, n*n / PP / PP, MPI_FLOAT, dolny_sasiad, tag, MPI_COMM_WORLD, &reqRecv[1]);
MPI_Isend(psa, n*n / PP / PP, MPI_FLOAT, lewy_sasiad, tag, MPI_COMM_WORLD, reqSend);
MPI_Isend(psb, n*n / PP / PP, MPI_FLOAT, gorny_sasiad, tag, MPI_COMM_WORLD, &reqSend[1]);
MPI_Wait(reqRecv, statRecv);
MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psa[i][j] = pra[i][j];
}
}
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
psb[i][j] = prb[i][j];
}
}
}
cout << "Proces: " << proces << " ";
for (int i = 0; i < n / PP; i++)
{
for (int j = 0; j < n / PP; j++)
{
cout << C[i][j] << " ";
}
}
MPI_Finalize();
return 0;
}
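As a side note, the manual neighbour arithmetic above can also be obtained from MPI's built-in Cartesian topology routines. Below is a minimal sketch (assuming the number of processes is a perfect square), shown only for comparison with the hand-rolled version:
#include <mpi.h>
#include <math.h>
#include <stdio.h>

/* Sketch: derive Cannon-style wrap-around neighbours from an MPI Cartesian
   topology instead of computing them by hand from the rank number. */
int main(int argc, char **argv)
{
    int rank, numprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

    int k = (int)sqrt((double)numprocs);   /* assumes numprocs is a perfect square */
    int dims[2] = { k, k };
    int periods[2] = { 1, 1 };             /* periodic, so shifts wrap around */
    MPI_Comm cart;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &cart);

    int left, right, up, down;
    MPI_Cart_shift(cart, 1, 1, &left, &right); /* neighbours along the row    */
    MPI_Cart_shift(cart, 0, 1, &up, &down);    /* neighbours along the column */

    printf("rank %d: left=%d right=%d up=%d down=%d\n", rank, left, right, up, down);

    MPI_Comm_free(&cart);
    MPI_Finalize();
    return 0;
}
MPI_Cart_shift on a periodic grid returns exactly the wrap-around source and destination ranks that the block rotations of A and B in Cannon's algorithm need.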

Related

SHA-256 OpenCL kernel needed [Help needed]

I need an SHA-256 kernel file. I am using Cloo as my OpenCL library, and it will be included in a WPF project.
I am calculating a hash value several times.
The program needs about 30 minutes or so to do that, but from what I've read OpenCL should reduce that time to under 3 minutes or less.
Thanks in advance.
[Edit]
OK, now I managed to do it using this:
https://searchcode.com/file/45893396/src/opencl/sha256_kernel.cl/
It works fine with strings, yet when I send my byte-array header to be hashed it returns a very different value than expected.
[Edit2]
It cannot handle large arrays; any array longer than 32 bytes returns messy results.
I found the kernel below and modified it to calculate a double hash, in case anyone needs it.
#ifndef uint8_t
#define uint8_t unsigned char
#endif
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#ifndef uint64_t
#define uint64_t unsigned long int
#endif
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
typedef struct
{
uint32_t state[8];
uint64_t count;
uint8_t buffer[64];
} CSha256;
inline void Sha256_Init(CSha256 *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25))
#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch2(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define sha_a(i) T[(0-(i))&7]
#define sha_b(i) T[(1-(i))&7]
#define sha_c(i) T[(2-(i))&7]
#define sha_d(i) T[(3-(i))&7]
#define sha_e(i) T[(4-(i))&7]
#define sha_f(i) T[(5-(i))&7]
#define sha_g(i) T[(6-(i))&7]
#define sha_h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch2(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, i+1); \
R(g,h,a,b,c,d,e,f, i+2); \
R(f,g,h,a,b,c,d,e, i+3); \
R(e,f,g,h,a,b,c,d, i+4); \
R(d,e,f,g,h,a,b,c, i+5); \
R(c,d,e,f,g,h,a,b, i+6); \
R(b,c,d,e,f,g,h,a, i+7)
#else
#define R(i) sha_h(i) += S1(sha_e(i)) + Ch2(sha_e(i),sha_f(i),sha_g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
sha_d(i) += sha_h(i); sha_h(i) += S0(sha_a(i)) + Maj(sha_a(i), sha_b(i), sha_c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
inline static void Sha256_Transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16];
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
inline static void Sha256_WriteByteBlock(CSha256 *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
Sha256_Transform(p->state, data32);
}
inline void Sha256_Update(CSha256 *p, __global const uint8_t *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
Sha256_WriteByteBlock(p);
}
}
}
inline void Sha256_Final(CSha256 *p, __global uint8_t *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
Sha256_WriteByteBlock(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
lenInBits <<= 8;
}
Sha256_WriteByteBlock(p);
for (i = 0; i < 8; i++)
{
*digest++ = (uint8_t)(p->state[i] >> 24);
*digest++ = (uint8_t)(p->state[i] >> 16);
*digest++ = (uint8_t)(p->state[i] >> 8);
*digest++ = (uint8_t)(p->state[i]);
}
Sha256_Init(p);
}
inline void Sha256_Update1(CSha256 *p, const uint8_t *data, uint32_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
Sha256_WriteByteBlock(p);
}
}
}
inline void Sha256_Final1(CSha256 *p, uint8_t *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
Sha256_WriteByteBlock(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (uint8_t)(lenInBits >> 56);
lenInBits <<= 8;
}
Sha256_WriteByteBlock(p);
for (i = 0; i < 8; i++)
{
*digest++ = (uint8_t)(p->state[i] >> 24);
*digest++ = (uint8_t)(p->state[i] >> 16);
*digest++ = (uint8_t)(p->state[i] >> 8);
*digest++ = (uint8_t)(p->state[i]);
}
Sha256_Init(p);
}
__kernel void Sha256_1(__global uint8_t *header,__global uint8_t *toRet)
{
uint8_t tempHdr[80];
uint8_t tempDigest[32]={0};
uint startNon=toRet[0] + (toRet[1] << 8) + (toRet[2] << 16) + (toRet[3] << 24);
uint maxNon=toRet[4] + (toRet[5] << 8) + (toRet[6] << 16) + (toRet[7] << 24);
uint nonce =startNon;
uint32_t finalNon=0;
uint8_t match=0;
for(int x=0;x<80;x++)
tempHdr[x]=header[x];
tempHdr[76] = (char)(nonce);
tempHdr[77] = (char)(nonce >> 8);
tempHdr[78] = (char)(nonce >> 16);
tempHdr[79] = (char)(nonce >> 24);
while(finalNon<1)
{
CSha256 p;
Sha256_Init(&p);
Sha256_Update1(&p, tempHdr, 80);
Sha256_Final1(&p, tempDigest);
CSha256 p1;
Sha256_Init(&p1);
Sha256_Update1(&p1, tempDigest, 32);
Sha256_Final1(&p1, tempDigest);
for(int x=31;x>21;x--)
{
if(tempDigest[x]<1) match++;
}
if(match>8)
{
finalNon=nonce;
toRet[8] = (char)(nonce);
toRet[9] = (char)(nonce >> 8);
toRet[10] = (char)(nonce >> 16);
toRet[11] = (char)(nonce >> 24);
}
else
{
nonce++;
tempHdr[76] = (char)(nonce);
tempHdr[77] = (char)(nonce >> 8);
tempHdr[78] = (char)(nonce >> 16);
tempHdr[79] = (char)(nonce >> 24);
}
match=0;
if(nonce>maxNon) break;
if(nonce<=startNon) break;
}
}
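As a usage note, the Sha256_1 kernel above reads the start and maximum nonce from the first eight bytes of toRet in little-endian order and writes the winning nonce back into bytes 8 to 11. A minimal host-side packing sketch in C (the buffer creation and kernel launch through Cloo are omitted, and the helper names are my own):
#include <stdint.h>
#include <string.h>

/* Hypothetical helpers: pack the nonce search window the way Sha256_1 expects.
   toRet[0..3] = start nonce (little-endian), toRet[4..7] = max nonce,
   toRet[8..11] is filled in by the kernel with the winning nonce. */
static void pack_nonce_window(uint8_t toRet[12], uint32_t start_nonce, uint32_t max_nonce)
{
    for (int i = 0; i < 4; i++) {
        toRet[i]     = (uint8_t)(start_nonce >> (8 * i));
        toRet[4 + i] = (uint8_t)(max_nonce   >> (8 * i));
    }
    memset(&toRet[8], 0, 4); /* cleared so the kernel's result is unambiguous */
}

static uint32_t unpack_found_nonce(const uint8_t toRet[12])
{
    return (uint32_t)toRet[8]
         | ((uint32_t)toRet[9]  << 8)
         | ((uint32_t)toRet[10] << 16)
         | ((uint32_t)toRet[11] << 24);
}

int main(void)
{
    uint8_t toRet[12];
    pack_nonce_window(toRet, 0u, 1000000u); /* search nonces 0 .. 1,000,000 */
    /* ... enqueue the kernel with the 80-byte header and toRet here ... */
    uint32_t found = unpack_found_nonce(toRet); /* 0 until the kernel has run */
    (void)found;
    return 0;
}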

Algorithm for computing inverse polynomials in NTRUEncrypt

I'm implementing an algorithm for computing inverse polynomials in the NTRU cryptosystem, following the paper "Almost Inverses and Fast NTRU Key Creation" by Joseph H. Silverman. I implemented the second pseudo-code listing as:
int inverse_mod_p(polynomial *r, polynomial *a)
{
int k;
int16_t b[NTRU_N + 1], c[NTRU_N + 1], f[NTRU_N + 1], g[NTRU_N + 1];
int i;
int16_t aux;
int zero_f;
int constant_f;
int deg_fg;
memset(b, 0, (NTRU_N + 1) * sizeof(int16_t));
b[0] = 1;
memset(c, 0, (NTRU_N + 1) * sizeof(int16_t));
memcpy(f, a->coeffs, NTRU_N * sizeof(int16_t));
f[NTRU_N] = 0;
memset(g, 0, (NTRU_N + 1) * sizeof(int16_t));
g[0] = -1;
g[NTRU_N] = 1;
while (1)
{
zero_f = 1;
for (i = 0; i < NTRU_N + 1; i++)
{
if (f[i] != 0)
{
zero_f = 0;
break;
}
}
if (zero_f)
return 1;
while (f[0] == 0)
{
for (i = 0; i < NTRU_N; i++)
{
f[i] = f[i + 1];
c[NTRU_N - i] = c[NTRU_N - i - 1];
}
f[NTRU_N] = 0;
c[0] = 0;
k++;
}
constant_f = 1;
for (i = 1; i < NTRU_N + 1; i++)
{
if (f[i] != 0)
{
constant_f = 0;
break;
}
}
if (constant_f)
break;
deg_fg = 0;
for (i = NTRU_N; i >= 0; i--)
{
if (f[i] == 0 && g[i] != 0)
{
deg_fg = 1;
break;
}
else if (f[i] != 0 && g[i] == 0)
{
break;
}
}
if (deg_fg)
{
for (i = 0; i < NTRU_N + 1; i++)
{
aux = f[i];
f[i] = g[i];
g[i] = aux;
aux = b[i];
b[i] = c[i];
c[i] = aux;
}
}
if (f[0] == g[0])
{
for (i = 0; i < NTRU_N + 1; i++)
{
f[i] = (f[i] - g[i] + 3) % 3;
b[i] = (b[i] - c[i] + 3) % 3;
}
}
else
{
for (i = 0; i < NTRU_N + 1; i++)
{
f[i] = (f[i] + g[i] + 3) % 3;
b[i] = (b[i] + c[i] + 3) % 3;
}
}
}
k = k % NTRU_N;
for (i = NTRU_N - 1; i >= 0; i--)
{
if (i - k < 0)
r->coeffs[i - k + NTRU_N] = b[i] * f[0];
else
r->coeffs[i - k] = b[i] * f[0];
}
for (i = 0; i < NTRU_N; i++)
r->coeffs[i] = (r->coeffs[i] + 3) % 3;
return 0;
}
But this seems to be wrong. I tested it using the example given on Wikipedia: https://en.wikipedia.org/wiki/NTRUEncrypt. The polynomial -1 + x + x^2 - x^4 + x^6 + x^9 - x^10 should have as its inverse the polynomial 1 + 2x + 2x^3 + 2x^4 + x^5 + 2x^7 + x^8 - x^10, but I got the following result:
Polinomial:
-1 1 1 0 -1 0 1 0 0 1 -1
Inverse polinomial:
0 2 2 1 0 2 1 2 0 1 2
Where is the error in the implementation?
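One way to narrow this down is a quick sanity check: multiply the candidate inverse by the original polynomial in (Z/3Z)[x]/(x^N - 1) and verify that the product reduces to the constant polynomial 1. A minimal sketch of such a check (my own helper, assuming the same lowest-degree-first coefficient layout as above):
#include <stdint.h>
#include <stdio.h>

#define NTRU_N 11  /* the Wikipedia toy example uses N = 11 */

/* Convolve a and b modulo (x^N - 1), reduce coefficients mod 3 and
   return 1 if the product is the constant polynomial 1. */
static int is_inverse_mod_p(const int16_t *a, const int16_t *b)
{
    int prod[NTRU_N] = {0};
    for (int i = 0; i < NTRU_N; i++)
        for (int j = 0; j < NTRU_N; j++)
            prod[(i + j) % NTRU_N] += a[i] * b[j];
    for (int i = 0; i < NTRU_N; i++) {
        int c = ((prod[i] % 3) + 3) % 3;   /* normalize negatives into {0,1,2} */
        if (c != (i == 0 ? 1 : 0))
            return 0;
    }
    return 1;
}

int main(void)
{
    /* f from the question: -1 + x + x^2 - x^4 + x^6 + x^9 - x^10 */
    int16_t f[NTRU_N] = {-1, 1, 1, 0, -1, 0, 1, 0, 0, 1, -1};
    /* the coefficients printed by the questioner's inverse_mod_p() */
    int16_t candidate[NTRU_N] = {0, 2, 2, 1, 0, 2, 1, 2, 0, 1, 2};
    printf("candidate verifies as inverse: %d\n", is_inverse_mod_p(f, candidate));
    return 0;
}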

OpenACC and CUDA-aware MPI

I want to move the whole while loop in main onto the device. The problem emerges when I add #pragma acc host_data use_device(err) to MPI_Allreduce(&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);.
The error is that the reduction on err doesn't work, so the code exits the loop after one step.
After the MPI_Allreduce(), even when using #pragma acc update self(err), err is still equal to zero.
I'm compiling with mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c
and running with mpirun -np 2 -mca pml ^ucx ./a.out
Could you help me find the error?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
err = err_glob;
#endif
// #pragma acc update host(err)
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain the number of processors along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarray in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}
Thanks for updating the example. There are a few issues here.
First, regarding "err" and "err_glob": at the beginning of the loop you set "err=0" on the host but don't update it on the device. Then, after the MPI_Allreduce call, you set "err=err_glob", again on the host, so you also need to update "err_glob" on the host first.
The second issue is that the code gets partially-present errors for "y" when run with multiple ranks. The problem is that you're using the global size rather than the local size for "x" and "y", so when you copy "y" it overlaps with "x" due to the offsets. I fixed this by copying "xg" and "yg" to the device instead.
As for performance relative to the CPU, the main problem here is that the size is small, so the code severely under-utilizes the GPU. I increased the GLOB sizes to 4096 and see better relative performance, though the code converges much faster.
I also took the liberty of adding some boilerplate code that I use for rank-to-device assignment so the code can take advantage of multiple GPUs.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif
#ifdef _OPENACC
#include <openacc.h>
#endif
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
{
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
int xsize,ysize;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
#ifdef PARALLEL
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
DomainDecomposition(&mpi_decomp);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
#else
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif
#ifdef _OPENACC
/* -------------------------------------------------------
0. Set the device for each rank
------------------------------------------------------- */
int device_type, num_devices;
int gpuId;
MPI_Comm shmcomm;
int local_rank;
// Get the local rank number
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
MPI_INFO_NULL, &shmcomm);
MPI_Comm_rank(shmcomm, &local_rank);
// Device num = local rank mod number of devices on the node
device_type = acc_get_device_type();
num_devices = acc_get_num_devices(device_type);
gpuId = local_rank % num_devices;
acc_set_device_num(gpuId, device_type);
acc_init(device_type);
#endif
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
#ifdef PARALLEL
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
#else
x = xg;
y = yg;
#endif
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
}}
#ifdef PARALLEL
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
#endif
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
//#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc update device(err)
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
}}
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
}}
#ifdef PARALLEL
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
{
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
#pragma acc update host(err_glob)
err = err_glob;
#endif
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
}
k++;
}
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
#ifdef PARALLEL
MPI_Finalize();
#endif
return 0;
}
#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
*
*********************************************************************** */
{
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain the number of processors along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
MPI_Finalize();
exit(1);
}
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
}
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
0, &MPI_COMM_CART);
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
}
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
}
}
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
return;
}
#endif
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
#ifdef PARALLEL
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
}}
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
}}
*/
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
{
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
#endif
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
}
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
}
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
}
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
}
return;
#ifdef PARALLEL
// Print
MPI_Barrier(MPI_COMM_WORLD);
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
}
printf ("\n");
}
}
}
MPI_Finalize();
exit(0);
#endif
}
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
*********************************************************************** */
{
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
MPI_File fh;
MPI_Datatype type_local, type_domain;
int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarray in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_File_close(&fh);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
#else
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
}
fclose(fp);
#endif
nfile++;
}
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
* Allocate memory for a double precision array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
double **buf;
buf = (double **)malloc (nx*sizeof(double *));
buf[0] = (double *) malloc (nx*ny*sizeof(double));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
* Allocate memory for an integer-type array with
* nx rows and ny columns
*********************************************************************** */
{
int i,j;
int **buf;
buf = (int **)malloc (nx*sizeof(int *));
buf[0] = (int *) malloc (nx*ny*sizeof(int));
for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
printf ("------------------------------\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%8.2f ", A[i][j]);
}
printf ("\n");
}
printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
*********************************************************************** */
{
int i, j;
printf ("%s\n",string);
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
for (i = 0; i < nx; i++) {
for (j = 0; j < ny; j++) {
printf ("%03d ", A[i][j]);
}
printf ("\n");
}
for (j = 0; j < ny; j++) printf ("-----");
printf ("\n");
}

How to not parallelize inner loops in OpenACC

I am a beginner at GPU programming with OpenACC and was trying to write a direct convolution. The convolution consists of 6 nested loops, and I only want the first loop to be parallelized. I gave the pragma #pragma acc loop for the first loop and #pragma acc loop seq for the rest, but the output I am getting is not correct. Is my approach to parallelizing the loop correct? Specifications for the convolution: input channels 3, input size 224x224x3, output channels 64, output size 111x111x64, filter size 3x3x3x64. The following link contains the header files dog.h and squeezenet_params.h: https://drive.google.com/drive/folders/1a9XRjBTrEFIorrLTPFHS4atBOPrG886i
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "squeezenet_params.h"
#include "dog.h"
void conv3x3(
const int input_channels, const int input_size,
const int pad, const int stride, const int start_channel,
const int output_size, const float* restrict input_im, const float* restrict filter_weight,
const float* restrict filter_bias, float* restrict output_im){
#pragma acc data copyin (input_im[0:150527],filter_weight[0:1727],filter_bias[0:63]) copyout(output_im[0:788543])
{
#pragma acc parallel
{
#pragma acc loop
for(int p=0;p<64;++p){
filter_weight += p * input_channels * 9;
float bias = filter_bias[p];
output_im += (start_channel + p) * output_size * output_size;
//loop over output feature map
#pragma acc loop seq
for(int i = 0; i < output_size; i++)
{
#pragma acc loop seq
for(int j = 0; j < output_size; j++)
{
//compute one element in the output feature map
float tmp = bias;
//compute dot product of 2 input_channels x 3 x 3 matrix
#pragma acc loop seq
for(int k = 0; k < input_channels; k++)
{
#pragma acc loop seq
for(int l = 0; l < 3; l++)
{
int h = i * stride + l - pad;
#pragma acc loop seq
for(int m = 0; m < 3; m++)
{
int w = j * stride + m - pad;
if((h >= 0) && (h < input_size) && (w >= 0) && (w < input_size))
{
tmp += input_im[k * input_size * input_size + (i * stride + l - pad) * input_size + j * stride + m - pad] \
* filter_weight[9 * k + 3 * l + m];
}
}
}
}
//add relu activation after conv
output_im[i * output_size + j] = (tmp > 0.0) ? tmp : 0.0;
}
}
}
}
}
}
void main(){
float * result = (float*)malloc(sizeof(float) * (1 * 64 * 111 * 111));
conv3x3(3,224,0,2,0,111,sample,conv1_weight,conv1_bias,result);
for(int i=0;i<64 * 111 * 111;++i){
//if(result[i]>0)
printf("%f:%d\n",result[i],i);
}
}
The poster asked the same question on the PGI User Forums, where I've answered it. (See: https://www.pgroup.com/userforum/viewtopic.php?f=4&t=7614). The premise of the title is incorrect: the inner loops are not being parallelized, nor are they the cause of the issue.
The problem is that the code has a race condition on the shared "output_im" pointer. My suggested solution is to compute a per-thread offset into the array rather than trying to manipulate the pointer itself.
for(int p=0;p<64;++p){
filter_weight += p * input_channels * 9;
float bias = filter_bias[p];
int offset;
offset = (start_channel + p) * output_size * output_size;
//loop over output feature map
#pragma acc loop vector collapse(2)
for(int i = 0; i < output_size; i++)
{
for(int j = 0; j < output_size; j++)
{
... cut ...
//add relu activation after conv
int idx = offset + (i * output_size + j);
output_im[idx] = (tmp > 0.0) ? tmp : 0.0;
}
}
}
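To make the pattern concrete, here is a simplified, self-contained toy sketch of the same idea (my own illustration, not the original convolution): each outer iteration derives an offset from its loop index, so no gang ever modifies a pointer shared with the others.
#include <stdio.h>
#include <stdlib.h>

/* Toy illustration of the per-iteration offset pattern: iteration p writes
   to out[p*chunk .. p*chunk+chunk-1] through a computed offset instead of
   advancing a pointer that all iterations share. */
int main(void)
{
    const int nblocks = 64, chunk = 16;
    float *out = (float *)malloc(sizeof(float) * nblocks * chunk);

    #pragma acc parallel loop copyout(out[0:nblocks*chunk])
    for (int p = 0; p < nblocks; ++p) {
        int offset = p * chunk;             /* private to this iteration */
        #pragma acc loop seq
        for (int i = 0; i < chunk; ++i)
            out[offset + i] = (float)(p + i);
    }

    printf("%f %f\n", out[0], out[nblocks * chunk - 1]);
    free(out);
    return 0;
}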

Is the Time Complexity of this function O(n * (n * log n² ))

What is the Time Complexity of the function below? n > 0
Function fun(n){
Let count = 0;
For( I = 0; I < n; I++){
For(j = 0; j < n; j /= 2) {
For(h = 0; h < n; h /= 2) {
Count = count + 1;
}
}
}
Return count;
}
I have O(n * (n * log n²)), but something tells me I might be wrong.
The above is an infinite loop, so its time complexity cannot be determined unless the problem statement is corrected.
Function fun(n){
Let count = 0;
For( I = 0; I < n; I++){
// will run infinitely even if you change j /= 2 to j *= 2, because initial value is 0
For(j = 0; j < n; j /= 2) {
// will run infinitely even if you change h /= 2 to h *= 2, because initial value is 0
For(h = 0; h < n; h /= 2) {
Count = count + 1;
}
}
}
Return count;
}
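For what it's worth, if the inner counters were meant to start at 1 and double each iteration (an assumption on my part; the posted code divides 0 by 2, which never terminates), each inner loop would run about log2(n) times and the total count would grow as n * (log2 n)^2, i.e. O(n log² n). A small C sketch of that corrected version:
#include <stdio.h>

/* Hypothetical corrected version: inner counters start at 1 and are doubled,
   so each inner loop executes roughly log2(n) times. */
static long fun(long n)
{
    long count = 0;
    for (long i = 0; i < n; i++)            /* n iterations          */
        for (long j = 1; j < n; j *= 2)     /* ~log2(n) iterations   */
            for (long h = 1; h < n; h *= 2) /* ~log2(n) iterations   */
                count++;
    return count;                           /* ~ n * log2(n)^2       */
}

int main(void)
{
    printf("%ld\n", fun(1024)); /* prints 102400 = 1024 * 10 * 10 */
    return 0;
}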