Hello, we have a problem with an LP model in OPL (IBM CPLEX). Our .mod file reports errors saying that our index names k and t do not exist - optimization

// Indices
int K = ...; // Produktgruppe
int T = ...; // Periode
// Parameter
float a[1..K] = ...; // Produktionskoeffizient für Produktgruppe k für die personelle Kapazität
float b[1..K] = ...; // Produktionskoeffizient für Produktgruppe k für die technische Kapazität
float d[1..K] [1..T] = ...; // Nachfrage nach Produktgruppe p zum Zeitpunkt t
float l[1..K] = ...; // Lagerkostensatz für Produktgruppe k
float u[1..T] = ...; // Überstundenkostensatz in Periode t
int Cmax[1..T] = ...; // Maximale technische Kapazität in Periode t
int Nmax[1..T] = ...; // Maximale personelle Kapazität in Periode t
int Umax[1..T] = ...; // Maximale Übersunden in Periode t
// Entscheidungsvariable
dvar float+ X[1..K] [1..T]; // Produktionsmenge von Produktgruppe k in Periode t
dvar float+ L[1..K] [1..T]; // Lagermenge von Produktgruppe k in Periode t
dvar float+ U[1..T]; // Genutze Überstunden in Periode t
// Zielfunktion
minimize sum (t in 1..T, k in 1..K) (l[k]*L[k][t]) + sum(t in 1..T) (u[t]*U[t]);
// Nebenbedingungen
subject to {
forall(t in 1..T, k in 1..K) error: Aggregation operator FORALL not available for int.
//ctTechnische Kapazitaetsrestriktion:
sum(k in 1..K) (b[k]*X[k][t] <= Cmax[t]); error: Name t does not exist
forall (k in 1..K)
//ctPersonelle Kapazitaetsrestriktion:
a[k]*X[k][t] - U[t] <= Nmax[t]; error: Name t does not exist
forall (t in 1..T)
//ctmaximale Ueberstundenrestriktion:
U[t] <= Umax[t];
forall (t in 1..T) (k in 1..K);
X[k][t] => 0; error: Name k does not exist
forall (t in 1..T)(k in 1..K);
// Lagermengen:
L[k][t] => 0; error: Name k does not exist
forall (t in 1..T) (k in 1..K);
// Ueberstundenrestriktion:
U[t] => 0; error: Name t does not exist
We declared our parameters correctly at the beginning, and we know that it is not necessary to define the index values t and k for the parameters. However, we get the same error in the constraint section of the LP model, telling us that they do not exist. If anyone can see where the problem is, please share your ideas/solutions.
Kind regards

// Production-planning LP with inline test data: choose production X,
// stock L and overtime U so that holding plus overtime cost is minimal.

// Indices
int K = 2; // number of product groups
int T = 3; // number of periods

// Parameters
float a[1..K] = [1,2];               // production coefficient of product group k for personnel capacity
float b[1..K] = [1,2];               // production coefficient of product group k for technical capacity
float d[i in 1..K][j in 1..T] = i*j; // demand for product group k in period t
                                     // (declared, but no constraint below refers to it)
float l[1..K] = [1,2];               // holding cost rate of product group k
float u[1..T] = [1,2,3];             // overtime cost rate in period t
int Cmax[1..T] = [1,2,3];            // maximum technical capacity in period t
int Nmax[1..T] = [1,2,3];            // maximum personnel capacity in period t
int Umax[1..T] = [1,2,3];            // maximum overtime in period t

// Decision variables
dvar float+ X[1..K][1..T]; // production quantity of product group k in period t
dvar float+ L[1..K][1..T]; // stock level of product group k in period t
dvar float+ U[1..T];       // overtime used in period t

// Objective: total holding cost plus total overtime cost
minimize sum(t in 1..T, k in 1..K) (l[k]*L[k][t]) + sum(t in 1..T) (u[t]*U[t]);

// Constraints
subject to {
  // Technical capacity: the comparison is OUTSIDE the sum, and t is bound
  // by the enclosing forall.
  forall(t in 1..T)
    sum(k in 1..K) b[k]*X[k][t] <= Cmax[t];

  // Personnel capacity (one constraint per period and product group)
  forall(t in 1..T, k in 1..K)
    a[k]*X[k][t] - U[t] <= Nmax[t];

  // Maximum overtime
  forall(t in 1..T)
    U[t] <= Umax[t];

  // Non-negativity. "dvar float+" already enforces these bounds; the
  // constraints are kept only to mirror the original formulation.
  forall(t in 1..T, k in 1..K)
    X[k][t] >= 0;
  forall(t in 1..T, k in 1..K)
    L[k][t] >= 0;
  // U depends on t only, so it is posted once per period.
  forall(t in 1..T)
    U[t] >= 0;
}
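This corrected version works fine. The changes compared with the original model are: the `...` data placeholders are replaced by inline values; each `forall` binds all the indices its constraint uses in a single parenthesized list with no trailing semicolon (`forall(t in 1..T, k in 1..K)` instead of `forall (t in 1..T) (k in 1..K);`); the comparison is moved outside the `sum(...)`; and `=>` is replaced by `>=`.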


CPLEX error: "cos float does not exist", i.e. an error is shown when using the cos and sin functions

Error correction: I have to include the cos and sin functions in my model, but doing so produces an error. I have tried the expressions Math.cos and Opl.cos, but neither works. The error is reported after the forall statement, and I only get it after adding the cos function.
float c1[0..3]=[50,0,0,50];
float c2[0..3]=[351,0,0,389];
float c3[0..3]=[44.6,0,0,40.6];
float pd[0..3]=[50,170,200,80];
float qd[0..3]=[10,20,30,40];
float V[0..3]=[1.0,1.0,1.0,1.0];
float del[0..3]=[0,0,0,0];
/*float pg[1..4]=[10,0,0,10];*/
float p[0..3];
float q[0..3];
int i=0;
float G[0..3][0..3]=[ [5.724138, -1.724138,0,-4],
float B[0..3][0..3]=[ [-12.31034,4.310345,0,8],
dvar float+ pg[0..3];
dvar float+ Qg[0..3];
minimize sum(i in 0..3)(c1[i]*pg[i]^2 + c2[i]*pg[i] + c3[i]);
subject to
{forall(i in 0..3)
p[i]==V[i]*(sum(j in 0..3)(V[j]*(G[i][j]*cos(del[i]-del[j]))));
forall(i in 0..3)
q[i]==V[i]*(sum(j in 0..3)(V[j]*(G[i][j])));
//forall(i in 0..3)
// pg[i]<=30;
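cos is not linear, so you cannot apply cos to decision variables in a MIP model solved by CPLEX.
If you need a nonlinear function, you could either:
- use CP Optimizer within CPLEX, or
- approximate the function with a piecewise-linear function.
However, in your case cos is applied only to data (del is a float array, not a dvar), so you can precompute the cosine values in an execute block and write the following model, which works fine: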
float c1[0..3]=[50,0,0,50];
float c2[0..3]=[351,0,0,389];
float c3[0..3]=[44.6,0,0,40.6];
float pd[0..3]=[50,170,200,80];
float qd[0..3]=[10,20,30,40];
float V[0..3]=[1.0,1.0,1.0,1.0];
float del[0..3]=[0,0,0,0];
/*float pg[1..4]=[10,0,0,10];*/
float p[0..3];
float q[0..3];
int i=0;
float deltacos[0..3][0..3];
range r=0..3;
execute fill_deltacos
for(var i in r) for (var j in r) deltacos[i][j]=Math.cos(del[i]-del[j]);
float G[0..3][0..3]=[ [5.724138, -1.724138,0,-4],
float B[0..3][0..3]=[ [-12.31034,4.310345,0,8],
dvar float+ pg[0..3];
dvar float+ Qg[0..3];
minimize sum(i in 0..3)(c1[i]*pg[i]^2 + c2[i]*pg[i] + c3[i]);
subject to
{forall(i in 0..3)
p[i]==V[i]*(sum(j in 0..3)(V[j]*(G[i][j]*deltacos[i][j])));
forall(i in 0..3)
q[i]==V[i]*(sum(j in 0..3)(V[j]*(G[i][j])));
//forall(i in 0..3)
// pg[i]<=30;

Operator not available for dvar float+ * float[][range]

int NbPeriods = ...; range Periods = 1..NbPeriods;
int NbParts = ...; range Parts = 1..NbParts;
int NbSuppliers = ...; range Suppliers = 1..NbSuppliers;
int NbProcesses = ...; range Processes = 1..NbProcesses;
int NbPS[1..NbParts, 1..NbProcesses*NbSuppliers] = ...;
float Demand[Parts][Periods] = ...;
float BOH[Parts] = ...;
float Capacity[Suppliers][Processes] = ...;
float ProcessMapping[s in 1..NbSuppliers, pr in 1..NbProcesses, p in 1..NbParts] = NbPS[p, pr+NbProcesses*(s-1)];
float Price[Parts][Suppliers] = ...;
dvar float+ Supply[1..NbParts, 1..NbPeriods*NbSuppliers];
dvar float+ EOH[Parts][Periods];
dvar float+ Util[1..NbProcesses, 1..NbPeriods*NbSuppliers];
sum( t in Periods ) DOIDelta[t] ;
subject to {
forall(p in Parts)
EOH[p][0] == BOH[p];
forall(p in Parts)
forall( t in Periods)
EOH[p][t] == EOH[p][t-1] + sum(s in Suppliers) Supply[p,t+NbPeriods*(s-1)] ;
forall(t in Periods)
forall(pr in Processes)
forall(s in Suppliers)
Util[pr,t+NbPeriods*(s-1)] == sum(p in Parts) (Supply[p,t+NbPeriods*(s-1)] * ProcessMapping[p, pr+NbProcesses*(s-1)] );
The error message is for the last Util line: Operator not available for dvar float+ * float[][range]. I have checked other posts on this topic, where the issue was missing parentheses, but even after adding parentheses the error remains. I would appreciate your help.
ProcessMapping is a 3D array, not a 2D array, so it needs three indices:
// ProcessMapping is declared as [supplier, process, part] and is plain data,
// so each Supply term is scaled by ProcessMapping[s, pr, p]. The product
// (dvar * data) stays linear; the multiplication must be outside the Supply
// index, otherwise it changes WHICH Supply variable is summed.
forall(t in Periods)
  forall(pr in Processes)
    forall(s in Suppliers)
      Util[pr, t+NbPeriods*(s-1)] == sum(p in Parts) (Supply[p, t+NbPeriods*(s-1)] * ProcessMapping[s, pr, p]);
will work better

OpenACC and CUDA aware MPI

I want to move the whole while loop in main onto the device. The problem emerges when I add #pragma acc host_data use_device(err) to MPI_Allreduce (&err, &err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);.
The error is that the reduction on err doesn't work, so the code exits the loop after one step.
After the MPI_Allreduce(), even when using #pragma acc update self(err), err is still equal to zero.
I'm compiling with mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c
And running with mpirun -np 2 -mca pml ^ucx ./a.out
Could you help me to find the error?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#include <mpi.h>
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
x = xg;
y = yg;
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[NX_GLOB+2*NGHOST], err, err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
err = err_glob;
// #pragma acc update host(err)
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], err, err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
return 0;
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
*********************************************************************** */
int dim, i;
int rank, size;
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
if (rank == 0) printf ("! Cannot decompose\n");
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
*********************************************************************** */
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y[:NY_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
// Print
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
printf ("\n");
/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
*********************************************************************** */
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
MPI_File fh;
MPI_Datatype type_local, type_domain;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
 * Allocate memory for a double precision array with
 * nx rows and ny columns.
 *
 * All elements live in ONE contiguous chunk of nx*ny doubles; buf[j]
 * points at the first element of row j inside that chunk.
 * The elements are not initialized.
 *
 * Returns the table of row pointers, or NULL when nx or ny is not
 * positive, when the byte count would overflow size_t, or when an
 * allocation fails.
 * Release with free(buf[0]) followed by free(buf).
 *********************************************************************** */
{
  int j;
  size_t rows, cols;
  double **buf;

  if (nx <= 0 || ny <= 0) return NULL;
  rows = (size_t)nx;
  cols = (size_t)ny;

  /* Reject sizes whose total byte count does not fit in size_t */
  if (rows > ((size_t)-1)/sizeof(double)/cols) return NULL;

  buf = (double **) malloc (rows*sizeof(double *));
  if (buf == NULL) return NULL;

  buf[0] = (double *) malloc (rows*cols*sizeof(double));
  if (buf[0] == NULL){
    free (buf);   /* do not leak the row table on partial failure */
    return NULL;
  }

  /* Each row starts ny elements after the previous one */
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
  return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
 * Allocate memory for an integer-type array with
 * nx rows and ny columns.
 *
 * All elements live in ONE contiguous chunk of nx*ny ints; buf[j]
 * points at the first element of row j inside that chunk.
 * The elements are not initialized.
 *
 * Returns the table of row pointers, or NULL when nx or ny is not
 * positive, when the byte count would overflow size_t, or when an
 * allocation fails.
 * Release with free(buf[0]) followed by free(buf).
 *********************************************************************** */
{
  int j;
  size_t rows, cols;
  int **buf;

  if (nx <= 0 || ny <= 0) return NULL;
  rows = (size_t)nx;
  cols = (size_t)ny;

  /* Reject sizes whose total byte count does not fit in size_t */
  if (rows > ((size_t)-1)/sizeof(int)/cols) return NULL;

  buf = (int **) malloc (rows*sizeof(int *));
  if (buf == NULL) return NULL;

  buf[0] = (int *) malloc (rows*cols*sizeof(int));
  if (buf[0] == NULL){
    free (buf);   /* do not leak the row table on partial failure */
    return NULL;
  }

  /* Each row starts ny elements after the previous one */
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;
  return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
 * Print the nx-by-ny array A to standard output.
 *
 * The title "string" is printed first, then a dashed rule, then one
 * text line per row (A[i][0] ... A[i][ny-1], each as "%8.2f "), and
 * finally a closing dashed rule. A is only read, never modified.
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  printf ("------------------------------\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%8.2f ", A[i][j]);
    }
    printf ("\n");   /* end of row i */
  }
  printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
 * Print the nx-by-ny integer array A to standard output.
 *
 * The title "string" is printed first, then a rule made of ny groups
 * of five dashes, then one text line per row (each entry as "%03d "),
 * and finally the same rule again. A is only read, never modified.
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);

  /* Top rule: one "-----" group per column */
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");

  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%03d ", A[i][j]);
    }
    printf ("\n");   /* end of row i */
  }

  /* Bottom rule */
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
}
Thanks for updating the example. There are a few issues here.
First, for "err" and "err_glob": at the beginning of the loop you set "err=0" on the host but don't update it on the device. Then, after the MPI_Allreduce call, you set "err=err_glob", again on the host, so you first need to update the host copy of "err_glob" from the device.
The second issue is that the code gets "partially present" errors for "y" when run with multiple ranks. The problem is that you're using the global size, not the local size, for "x" and "y", so when you copy "y" it overlaps with "x" due to the offsets. I fixed this by copying "xg" and "yg" to the device instead.
As for performance relative to the CPU, the main problem is that the problem size is small, so the code severely underutilizes the GPU. I increased the GLOB sizes to 4096 and see better relative performance, though the code converges much faster.
I also took the liberty of adding some boilerplate code that I use for rank-to-device assignment, so the code can take advantage of multiple GPUs.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PARALLEL
#define NX_GLOB 128 /* Global number of interior points */
#define NY_GLOB 128 /* Global number of interior points */
#define NGHOST 1
#define NDIM 2
#include <mpi.h>
#ifdef _OPENACC
#include <openacc.h>
typedef struct MPI_Decomp_{
int nprocs[NDIM]; /* Number of processors in each dimension */
int periods[NDIM]; /* Periodicity flag in each dimension */
int coords[NDIM]; /* Cartesian coordinate in the MPI topology */
int gsize[NDIM]; /* Global domain size (no ghosts) */
int lsize[NDIM]; /* Local domain size (no ghosts) */
int start[NDIM]; /* Local start index in each dimension */
int procL[NDIM]; /* Rank of left-lying process in each direction */
int procR[NDIM]; /* Rank of right-lying process in each direction */
int rank; /* Local process rank */
int size; /* Communicator size */
} MPI_Decomp;
void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);
double **Allocate_2DdblArray(int, int);
int **Allocate_2DintArray(int, int);
void Show_2DdblArray(double **, int, int, const char *);
void Show_2DintArray(int **, int, int, const char *);
int nx_tot, ny_tot;
int main(int argc, char ** argv)
int nx, i, ibeg, iend;
int ny, j, jbeg, jend;
int k, rank=0, size=1;
int xsize,ysize;
double xbeg = 0.0, xend = 1.0;
double ybeg = 0.0, yend = 1.0;
double dx = (xend - xbeg)/(NX_GLOB + 1);
double dy = (yend - ybeg)/(NY_GLOB + 1);
double *xg, *yg, *x, *y, **phi, **phi0;
double err, tol;
MPI_Decomp mpi_decomp;
double err_glob;
int procL[NDIM] = {-1,-1};
int procR[NDIM] = {-1,-1};
/* --------------------------------------------------------
0. Initialize the MPI execution environment
-------------------------------------------------------- */
MPI_Datatype row_type, col_type;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
nx = mpi_decomp.lsize[0];
ny = mpi_decomp.lsize[1];
mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#ifdef _OPENACC
/* -------------------------------------------------------
0. Set the device for each rank
------------------------------------------------------- */
int device_type, num_devices;
int gpuId;
MPI_Comm shmcomm;
int local_rank;
// Get the local rank number
MPI_INFO_NULL, &shmcomm);
MPI_Comm_rank(shmcomm, &local_rank);
// Device num = local rank mod number of devices on the node
device_type = acc_get_device_type();
num_devices = acc_get_num_devices(device_type);
gpuId = local_rank % num_devices;
acc_set_device_num(gpuId, device_type);
/* --------------------------------------------------------
1. Set local grid indices
-------------------------------------------------------- */
ibeg = NGHOST;
iend = ibeg + nx - 1;
nx = iend - ibeg + 1;
nx_tot = nx + 2*NGHOST;
jbeg = NGHOST;
jend = jbeg + ny - 1;
ny = jend - jbeg + 1;
ny_tot = ny + 2*NGHOST;
/* --------------------------------------------------------
2. Generate global and local grids
-------------------------------------------------------- */
xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
x = xg + mpi_decomp.start[0];
y = yg + mpi_decomp.start[1];
x = xg;
y = yg;
/* --------------------------------------------------------
3. Allocate memory on local processor and
assign initial conditions.
-------------------------------------------------------- */
phi = Allocate_2DdblArray(ny_tot, nx_tot);
phi0 = Allocate_2DdblArray(ny_tot, nx_tot);
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = 0.0;
MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
MPI_Type_vector (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
MPI_Type_commit (&row_type);
MPI_Type_commit (&col_type);
/* --------------------------------------------------------
4. Main iteration cycle
-------------------------------------------------------- */
tol = 1.e-5;
err = 1.0;
k = 0;
//#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot], x[:NX_GLOB+2*NGHOST], y[:NX_GLOB+2*NGHOST])
#pragma acc enter data copyin(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
while (err > tol){
/* -- 4a. Set boundary conditions first -- */
BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);
/* -- 4b. Jacobi's method and residual (interior points) -- */
err = 0.0;
#pragma acc update device(err)
#pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = 0.25*( phi0[j][i-1] + phi0[j][i+1]
+ phi0[j-1][i] + phi0[j+1][i] );
err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
#pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot])
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi0[j][i] = phi[j][i];
// double err_glob;
#pragma acc host_data use_device(err, err_glob)
MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#pragma acc update host(err_glob)
err = err_glob;
if (rank == 0){
printf ("k = %d; err = %8.3e\n",k, err);
#pragma acc exit data copyout(phi[:ny_tot][:nx_tot], phi0[:ny_tot][:nx_tot],err,err_glob)
WriteSolution (phi, nx, ny, &mpi_decomp);
return 0;
/* ********************************************************************* */
// DomainDecomposition: fills the members of *mpi_decomp for the calling
// process: rank and size (from MPI_COMM_WORLD), the number of processes per
// dimension, the periodicity flags, the Cartesian coordinates, the global and
// local grid sizes, the starting offset, and the ranks of the lower (procL)
// and upper (procR) neighbour in each dimension. It ends by printing this
// information once per rank.
//
// NOTE(review): the block delimiters of this listing are missing, so the
// extent of each if/for body below has to be inferred -- confirm against the
// original file before editing.
void DomainDecomposition(MPI_Decomp *mpi_decomp)
*********************************************************************** */
int dim, i;
int rank, size;
// Local aliases for the arrays stored inside *mpi_decomp; writes through
// these pointers update the structure itself.
int *coords = mpi_decomp->coords;
int *gsize = mpi_decomp->gsize;
int *lsize = mpi_decomp->lsize;
int *nprocs = mpi_decomp->nprocs;
int *periods = mpi_decomp->periods;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int *start = mpi_decomp->start;
int new_coords[NDIM];
/* --------------------------------------------------------
1. Get rank & size
-------------------------------------------------------- */
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
mpi_decomp->rank = rank;
mpi_decomp->size = size;
/* --------------------------------------------------------
2. Obtain number of processor along each dimension.
Use maximally squared decomp.
-------------------------------------------------------- */
// nprocs[0] is the truncated square root of size; nprocs[1] is the integer
// quotient. Their product equals size only when nprocs[0] divides size.
nprocs[0] = (int)sqrt(size);
nprocs[1] = size/nprocs[0];
if (nprocs[0]*nprocs[1] != size){
// NOTE(review): only the message is visible in this branch; no abort or
// return follows it in this listing -- presumably one was lost, confirm.
if (rank == 0) printf ("! Cannot decompose\n");
if (rank == 0){
printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
// Both directions are set non-periodic.
periods[0] = 0;
periods[1] = 0;
/* --------------------------------------------------------
3. Create Cartesian topology
-------------------------------------------------------- */
// NOTE(review): the MPI_Cart_create call below is cut off after `periods,`;
// its remaining arguments are not visible in this listing.
MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods,
MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);
/* --------------------------------------------------------
4. Fill structure members
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
// NOTE(review): assuming NX_GLOB/NY_GLOB are integer constants, these are
// integer divisions, so any remainder cells are not assigned to a process
// -- confirm the global sizes are multiples of nprocs.
lsize[0] = NX_GLOB/nprocs[0];
lsize[1] = NY_GLOB/nprocs[1];
start[0] = coords[0]*lsize[0];
start[1] = coords[1]*lsize[1];
/* --------------------------------------------------------
5. Determine ranks of neighbour processors
-------------------------------------------------------- */
// For each dimension, step one coordinate up (procR) and one down (procL);
// a step that leaves the range [0, nprocs[dim]) yields MPI_PROC_NULL.
for (dim = 0; dim < NDIM; dim++) {
for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];
new_coords[dim] = coords[dim] + 1;
if (new_coords[dim] < nprocs[dim]) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
} else {
procR[dim] = MPI_PROC_NULL;
new_coords[dim] = coords[dim] - 1;
if (new_coords[dim] >= 0) {
MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
} else {
procL[dim] = MPI_PROC_NULL;
/* --------------------------------------------------------
6. Print processor information.
(Use MPI_Bcast() to print in sequence)
-------------------------------------------------------- */
int proc, go;
// Every rank sets go = proc and then takes part in a broadcast from rank 0;
// only the rank equal to go prints during that iteration.
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("[Rank %d]\n",rank);
printf (" coords = [%d, %d], lsize = [%d, %d]\n",
coords[0], coords[1], lsize[0], lsize[1]);
for (dim = 0; dim < NDIM; dim++){
printf (" (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
/* ********************************************************************* */
// BoundaryConditions: fills the ghost cells of phi.
//   1. For each of the four sides, packs the outermost interior column/row
//      into send_buf, exchanges it with the neighbour on that side through
//      MPI_Sendrecv, and unpacks recv_buf into the adjacent ghost column/row.
//   2. On sides whose neighbour rank is negative, writes fixed boundary
//      values instead: left 1-y, right y*y, bottom 1-x, top x.
//   3. Prints phi (including ghost cells) one rank at a time.
// phi is indexed as phi[j][i]; x and y are indexed with the same i and j.
//
// NOTE(review): the block delimiters of this listing are missing, and
// nx_tot / ny_tot are not declared in the visible lines -- presumably they
// are file-scope names; confirm.
void BoundaryConditions(double **phi, double *x, double *y,
int nx, int ny, MPI_Decomp *mpi_decomp)
*********************************************************************** */
int i,j;
// Interior index range: [ibeg, iend] x [jbeg, jend]; ghost cells lie just
// outside it.
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
int *procL = mpi_decomp->procL;
int *procR = mpi_decomp->procR;
int rank = mpi_decomp->rank;
int size = mpi_decomp->size;
// NOTE(review): both buffers are sized with NX_GLOB but are also indexed by j
// (up to jend) for the left/right exchange -- confirm NX_GLOB + 2*NGHOST is
// large enough for the local y extent as well.
double send_buf[NX_GLOB + 2*NGHOST];
double recv_buf[NX_GLOB + 2*NGHOST];
// NOTE(review): the comment opened on the next line has no visible
// terminator in this listing; it presumably ended after the two test loops.
/* Used for testing
for (j = 0; j <= jend+1; j++){
for (i = 0; i <= iend+1; i++){
phi[j][i] = -1;
for (j = jbeg; j <= jend; j++){
for (i = ibeg; i <= iend; i++){
phi[j][i] = rank;
// NOTE(review): here and in every later clause, recv_buf[NX_GLOB+2*NGHOST]
// has no ':' while send_buf[:NX_GLOB+2*NGHOST] does. The two were presumably
// meant to be the same array section -- confirm against the OpenACC
// array-section syntax.
#pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Left buffer
i = ibeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
// The count jend+1 covers buffer indices 0..jend, i.e. it includes the
// entries below jbeg that the packing loop did not write.
// NOTE(review): each MPI_Sendrecv call in this function is cut off after the
// receive tag; its remaining arguments are not visible in this listing.
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];
// Right buffer
i = iend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];
// Bottom buffer
j = jbeg;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
// #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];
// Top buffer
j = jend;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], send_buf[:NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
#pragma acc host_data use_device(send_buf, recv_buf)
MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], recv_buf[NX_GLOB+2*NGHOST])
for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
#pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST], recv_buf[NX_GLOB+2*NGHOST])
// Physical boundaries: applied only where the neighbour rank is negative.
// NOTE(review): this presumably relies on MPI_PROC_NULL being a negative
// value -- confirm for the MPI implementation in use.
// NOTE(review): x and y appear in present() clauses below, but the only
// enter-data directive in the main loop that lists them is commented out --
// confirm they are actually on the device.
/* -- Left -- */
if (procL[0] < 0){
i = ibeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
/* -- Right -- */
if (procR[0] < 0){
i = iend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], y)
for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
/* -- Bottom -- */
if (procL[1] < 0){
j = jbeg-1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
/* -- Top -- */
if (procR[1] < 0){
j = jend+1;
#pragma acc parallel loop present(phi[:ny_tot][:nx_tot], x)
for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
// Print
// Each rank in turn prints its phi rows from j = jend+1 down to 0.
// NOTE(review): this reads phi on the host; no update-host directive for phi
// is visible before it, so the printed values may not reflect device data
// -- confirm whether this section was meant to be enabled.
int go, proc;
for (proc = 0; proc < size; proc++){
go = proc;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == go) {
printf ("Boundary [Rank %d]\n",rank);
for (j = jend+1; j >= 0; j--){
for (i = 0; i <= iend+1; i++){
printf ("%6.2f ", phi[j][i]);
printf ("\n");
/* ********************************************************************* */
// WriteSolution: writes phi to the file "laplace2D_MPIACC.txt".
// Two MPI subarray datatypes are built: type_local selects the interior of
// the local array (ghost zones excluded) and type_domain places that block
// inside the NX_GLOB x NY_GLOB global array; the data is then written with
// MPI_File_write_all. A plain fopen/fwrite of the interior rows follows.
//
// NOTE(review): the block delimiters of this listing are missing, so it is
// not possible to tell which of the sections below were originally disabled
// (for example by a preprocessor conditional) -- confirm against the
// original file.
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
*********************************************************************** */
int i,j;
int ibeg = NGHOST;
int iend = ibeg + nx - 1;
int jbeg = NGHOST;
int jend = jbeg + ny - 1;
static int nfile = 0;
char fname[32];
// NOTE(review): the format string has no conversion, so nfile is ignored
// and every call produces the same file name.
sprintf (fname,"laplace2D_MPIACC.txt",nfile);
// NOTE(review): as listed, the next four lines overwrite phi (with -1, then
// with the rank on the interior) before it is written. This looks like debug
// code -- confirm it is not active in the real file.
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
phi[j][i] = -1;
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
phi[j][i] = md->rank;
MPI_File fh;
MPI_Datatype type_local, type_domain;
int gsize[2], lsize[2], start[2];
/* --------------------------------------------------------
1. Create a local array type without the ghost zones
This datatype will be passed to MPI_File_write()
-------------------------------------------------------- */
gsize[0] = md->lsize[0] + 2*NGHOST;
gsize[1] = md->lsize[1] + 2*NGHOST;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = NGHOST;
start[1] = NGHOST;
// NOTE(review): both MPI_Type_create_subarray calls in this function are cut
// off after `start,`; the remaining arguments are not visible.
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_Type_commit (&type_local);
/* --------------------------------------------------------
2. Create the subarry in the global domain.
This datatype is used to set the file view.
-------------------------------------------------------- */
gsize[0] = NX_GLOB;
gsize[1] = NY_GLOB;
lsize[0] = md->lsize[0];
lsize[1] = md->lsize[1];
start[0] = lsize[0]*md->coords[0]; // equal to md->start[0]
start[1] = lsize[1]*md->coords[1]; // equal to md->start[1]
MPI_Type_create_subarray (NDIM, gsize, lsize, start,
MPI_Type_commit (&type_domain);
/* --------------------------------------------------------
3. Write to disk
-------------------------------------------------------- */
// Any existing file is deleted first, then the file is opened collectively
// on MPI_COMM_CART.
// NOTE(review): amode is not declared in the visible lines, and no
// MPI_File_close call is visible -- confirm both exist in the original.
MPI_File_delete(fname, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
// phi[0] is passed as the start of the whole local array.
// NOTE(review): this presumably assumes phi was allocated as one contiguous
// block -- confirm against the allocator used by the caller.
MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
MPI_Type_free (&type_local);
MPI_Type_free (&type_domain);
// NOTE(review): the lines below reopen the same file name with "wb" and
// write the interior rows with fwrite; no fclose is visible. If active
// together with the MPI write above, this would replace that output --
// presumably it was the alternative branch of a conditional; confirm.
FILE *fp;
printf ("> Writing %s\n",fname);
fp = fopen(fname, "wb");
for (j = jbeg; j <= jend; j++){
fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
/* ********************************************************************* */
double **Allocate_2DdblArray(int nx, int ny)
/*
 * Allocate memory for a double precision array with
 * nx rows and ny columns.
 *
 * The elements live in one contiguous block of nx*ny doubles; the returned
 * row-pointer table has nx entries, with buf[r] pointing at the start of
 * row r inside that block (so buf[0] is the address of the whole block).
 *
 * Returns NULL when nx or ny is not positive, when nx*ny*sizeof(double)
 * would overflow size_t, or when either allocation fails.
 * Ownership passes to the caller, who releases the array with
 * free(buf[0]) followed by free(buf).
 *********************************************************************** */
{
  double **buf;
  size_t rows, cols, r;

  if (nx <= 0 || ny <= 0) return NULL;
  rows = (size_t)nx;
  cols = (size_t)ny;

  /* Do the size arithmetic in size_t and reject products that would wrap. */
  if (cols > ((size_t)-1) / sizeof(double) / rows) return NULL;

  buf = malloc(rows * sizeof *buf);
  if (buf == NULL) return NULL;

  buf[0] = malloc(rows * cols * sizeof **buf);
  if (buf[0] == NULL) {
    free(buf);
    return NULL;
  }

  /* Each row starts ny elements after the previous one. */
  for (r = 1; r < rows; r++) buf[r] = buf[r-1] + cols;
  return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx, int ny)
/*
 * Allocate memory for an integer-type array with
 * nx rows and ny columns.
 *
 * The elements live in one contiguous block of nx*ny ints; the returned
 * row-pointer table has nx entries, with buf[r] pointing at the start of
 * row r inside that block (so buf[0] is the address of the whole block).
 *
 * Returns NULL when nx or ny is not positive, when nx*ny*sizeof(int)
 * would overflow size_t, or when either allocation fails.
 * Ownership passes to the caller, who releases the array with
 * free(buf[0]) followed by free(buf).
 *********************************************************************** */
{
  int **buf;
  size_t rows, cols, r;

  if (nx <= 0 || ny <= 0) return NULL;
  rows = (size_t)nx;
  cols = (size_t)ny;

  /* Do the size arithmetic in size_t and reject products that would wrap. */
  if (cols > ((size_t)-1) / sizeof(int) / rows) return NULL;

  buf = malloc(rows * sizeof *buf);
  if (buf == NULL) return NULL;

  buf[0] = malloc(rows * cols * sizeof **buf);
  if (buf[0] == NULL) {
    free(buf);
    return NULL;
  }

  /* Each row starts ny elements after the previous one. */
  for (r = 1; r < rows; r++) buf[r] = buf[r-1] + cols;
  return buf;
}
/* ********************************************************************* */
void Show_2DdblArray(double **A, int nx, int ny, const char *string)
/*
 * Print the nx-by-ny double array A to standard output.
 *
 * Output layout: the caption `string` on its own line, a dashed rule,
 * one text line per row i (each element A[i][j] printed with "%8.2f "),
 * and a closing dashed rule.
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  printf ("------------------------------\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%8.2f ", A[i][j]);
    }
    printf ("\n");  /* end of row i */
  }
  printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A, int nx, int ny, const char *string)
/*
 * Print the nx-by-ny integer array A to standard output.
 *
 * Output layout: the caption `string` on its own line, a dashed rule
 * whose width scales with ny (five dashes per column), one text line
 * per row i (each element A[i][j] printed with "%03d "), and a closing
 * rule of the same width.
 *********************************************************************** */
{
  int i, j;

  printf ("%s\n",string);
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%03d ", A[i][j]);
    }
    printf ("\n");  /* end of row i */
  }
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
}

Setting up the Accelerate framework for FFT on the iPhone

I wrote a function to set up the Accelerate framework after reading:
Using the Apple FFT and Accelerate Framework
iPhone FFT with Accelerate framework vDSP
and the Apple docs.
This is what I did:
// fftSetup (code from the question): packs a 1024-point real ramp signal into
// split-complex form, runs a forward and then an inverse real FFT with vDSP,
// computes bin magnitudes, and scales the result by 1/(2n).
// NOTE(review): the block delimiters of this listing are missing, and `A` is
// not declared in the visible lines -- presumably a split-complex struct
// declared elsewhere; confirm.
void fftSetup()
FFTSetup setupReal;
uint32_t log2n;
uint32_t n, nOver2;
int32_t stride;
uint32_t i;
float *originalReal, *obtainedReal;
float scale;
uint32_t L = 1024;
float *mag = new float[L/2];
// n = 2^log2n = 1024 samples; nOver2 = 512 split-complex elements.
log2n = 10 ;
n = 1 << log2n;
stride = 1;
nOver2 = n / 2;
printf("1D real FFT of length log2 ( %d ) = %d\n\n", n, log2n);
// NOTE(review): originalReal is written below, but no allocation for it is
// visible anywhere above -- this matches the bad-access crash described in
// the question.
for (i = 0; i < n; i++)
originalReal[i] = (float) (i + 1);
// NOTE(review): A.realp and A.imagp are allocated only AFTER this call, which
// already writes through them -- the allocations need to come first.
vDSP_ctoz((COMPLEX *) originalReal,2,&A,1,nOver2);
A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
// Forward transform immediately followed by the inverse transform, both in
// place on A, so the magnitudes below are taken after the inverse call.
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_INVERSE);
//get magnitude;
// The loop starts at i = 1, so mag[0] is never written.
for(i = 1; i < L/2; i++){
mag[i] = sqrtf(A.realp[i]*A.realp[i] + A.imagp[i] * A.imagp[i]);
scale = (float) 1.0 / (2 * n);
vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);
Questions:
My app always crashes with no error message (BAD ACCESS) on one of these two lines:
originalReal[i] = (float) (i + 1); // or
vDSP_ctoz((COMPLEX *) originalReal,2,&A,1,nOver2);
Did I set a wrong value for log2n? (I used 10 to get a 1024-sample window.)
How do I get the real magnitude of the bins, i.e. my actual FFT? Is it the way I wrote it here?
Where exactly in my code do I pass in MY data buffer array? In place of originalReal?
Thanks a lot.
I actually managed to make it work when I feed it a sinusoid of a known frequency f.
This is the code:
// Working version from the answer: generates a 1024-sample cosine, packs it
// into split-complex form, runs a forward real FFT with vDSP, scales the
// result by 1/(2n), and computes the magnitude of bins 1..L/2-1.
// NOTE(review): `A` is not declared in the visible lines -- presumably a
// split-complex struct declared elsewhere; confirm.
FFTSetup setupReal;
uint32_t log2n;
uint32_t n, nOver2;
int32_t stride;
uint32_t i;
float *originalReal, *obtainedReal;
float scale;
uint32_t L = 1024;
float *mag = new float[L/2];
// n = 2^log2n = 1024 samples; nOver2 = 512 split-complex elements.
log2n = 10 ;
n = 1 << log2n;
stride = 1;
nOver2 = n / 2;
//printf("1D real FFT of length log2 ( %d ) = %d\n\n", n, log2n);
// All buffers are allocated here, before anything reads or writes them.
// NOTE(review): obtainedReal is allocated but not used in the visible lines,
// and no matching free / delete[] / FFT-setup teardown is visible -- confirm
// the cleanup happens elsewhere.
A.realp = (float *) malloc(nOver2 * sizeof(float));
A.imagp = (float *) malloc(nOver2 * sizeof(float));
originalReal = (float *) malloc(n * sizeof(float));
obtainedReal = (float *) malloc(n * sizeof(float));
// Test input: the expression below evaluates cos(2*pi*11000*i/44100) for
// sample index i.
for (i = 0; i < n; i++)
originalReal[i] = cos(2*3.141592*11000*i/44100);//(float) (i + 1);
// Reinterpret the real samples as interleaved pairs and split them into
// A.realp / A.imagp (nOver2 elements each).
vDSP_ctoz((COMPLEX *) originalReal,2,&A,1,nOver2);
setupReal = vDSP_create_fftsetup(log2n, FFT_RADIX2);
vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_FORWARD);
//vDSP_fft_zrip(setupReal, &A, stride, log2n, FFT_INVERSE);
scale = (float) 1.0 / (2 * n);
vDSP_vsmul(A.realp, 1, &scale, A.realp, 1, nOver2);
vDSP_vsmul(A.imagp, 1, &scale, A.imagp, 1, nOver2);
//get magnitude;
// The loop starts at i = 1, so mag[0] is never written.
for(i = 1; i < L/2; i++)
mag[i] = sqrtf(A.realp[i]*A.realp[i] + A.imagp[i] * A.imagp[i]);
Note that the spacing between bins is not 44 Hz, as stated in the post above, but about 43 Hz (22050/512 ≈ 43). This is critical: in the higher bins, such as bin[300], you get a completely different result with 44 than with 43 (a drift of about 300 Hz), so take care with that.

What is the better way to implement a perfect hash function for an iOS app?

I need to create a perfect hash for a list of string identifiers. Before I begin this implementation (I have never done it before), I would like to know whether there is a good framework or tutorial that could be useful.
I use the MurmurHash written by Austin Appleby:
/*
 * Hash: 32-bit MurmurHash2-style hash of the `size` bytes at `buffer`,
 * mixed with `seed`.
 *
 * Input is consumed four bytes at a time, assembled least-significant byte
 * first so the result does not depend on host byte order; the remaining
 * 0-3 bytes are folded in by the switch, followed by a final avalanche.
 * `buffer` may be any readable pointer when size is 0 (it is not read).
 *
 * Each byte is widened to unsigned int before shifting: shifting the
 * int-promoted byte left by 24 is undefined when the byte is >= 0x80.
 *
 * NOTE(review): Austin Appleby's published MurmurHash2 uses r = 24; this
 * listing has r = 2, which is kept here so existing hash values do not
 * change -- confirm which value is intended.
 */
unsigned int Hash (const char* buffer, size_t size, unsigned seed)
{
    const unsigned int m = 0x5bd1e995;
    const int r = 2;
    unsigned int h = seed ^ (unsigned int)size;
    const unsigned char* data = (const unsigned char*)buffer;

    /* Body: mix one 4-byte word per iteration. */
    while (size >= 4)
    {
        unsigned int k;

        k  = (unsigned int)data[0];
        k |= (unsigned int)data[1] << 8;
        k |= (unsigned int)data[2] << 16;
        k |= (unsigned int)data[3] << 24;

        k *= m;
        k ^= k >> r;
        k *= m;

        h *= m;
        h ^= k;

        data += 4;
        size -= 4;
    }

    /* Tail: fold in the last 1-3 bytes; the cases fall through on purpose. */
    switch (size)
    {
    case 3: h ^= (unsigned int)data[2] << 16; /* fallthrough */
    case 2: h ^= (unsigned int)data[1] << 8;  /* fallthrough */
    case 1: h ^= (unsigned int)data[0];
            h *= m;
            break;
    default:
            break;
    }

    /* Final avalanche so the last bytes affect all output bits. */
    h ^= h >> 13;
    h *= m;
    h ^= h >> 15;
    return h;
}
But ultimately your choice of hashing function depends on the trade-off between quality and speed.