C-DVM - contents Part 1(1-4) Part2 (5-11) Part 3 (Appendixes)
 created: april, 2001 - last edited 08.10.02 -

Appendix 1. Examples of C-DVM programs.

Six small scientific programs are presented to illustrate C-DVM language features. They are intended for solving a system of linear equations:

A x = b

where A - matrix of coefficients,
b - vector of free members,
x - vector of unknowns.

The following basic methods are used to illustrate CDVM language features.

Direct methods. The well-known Gaussian Elimination method is the most commonly used algorithm of this class. The main idea of this algorithm is to reduce the matrix A to upper triangular form and then to use backward substitution to diagonalize the matrix.

Explicit iteration methods. Jacobi Relaxation is the most known algorithm of this class. The algorithm performs the following computation iteratively

xi,jnew = ( xi-1,jold + xi,j-1old + xi+1,jold + xi,j+1old ) / 4

Implicit iteration methods. Successive Over Relaxation (SOR) belongs to this class. The algorithm performs the following calculation iteratively

xi,jnew = ( w / 4 ) * ( xi-1,jnew + xi,j-1new + xi+1,jold + xi,j+1old ) + (1-w) * xi,jold

By using “red-black” coloring of variables each step of SOR consists of two half Jacobi steps. One processes “red” variables and the other processes “black” variables. Coloring of variables allows to overlap calculation and communication.

Example 1. The Gauss elimination algorithm.

This is a program to solve a system of linear equations Ax = B by Gauss elimination method. The coefficient matrix A is represented by an array section A[0:N-1][0:N-1], and the vector B is represented by the section A[0:N-1][N] of the same array.

```/* GAUSS  program */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)
#define N 10

int main (int argn, char **args)
{
long  i, j, k;

/* declaration of dynamic distributed arrays */
DVM(DISTRIBUTE [BLOCK] [])  float (*A)[N+1];
DVM(DISTRIBUTE [BLOCK])  float (*X);
/* creation of arrays */
A = malloc( N*(N+1)*sizeof(float));
X = malloc( N*sizeof(float));
/* initialize array A*/
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,N-1,1)
DO(j,0,N,1)
if (i==j || j==N)  A[i][j] = 1.f;
else A[i][j]=0.f;

/* elimination */
for (i=0; i<N-1; i++)
{
DVM(PARALLEL [k][j] ON A[k][j]; REMOTE_ACCESS A[i][])
DO (k,i+1,N-1,1)
DO (j,i+1,N,1)
A[k][j] = A[k][j]-A[k][i]*A[i][j]/A[i][i];
}
/* reverse substitution */
X[N-1] = A[N-1][N]/A[N-1][N-1];
for (j=N-2; j>=0; j-=1)
{
DVM(PARALLEL [k] ON A[k][]; REMOTE_ACCESS X[j+1])
DO (k,0,j,1)
A[k][N] = A[k][N]-A[k][j+1]*X[j+1];
X[j]=A[j][N]/A[j][j];
DVM(REMOTE_ACCESS X[j])
printf(“j=%4i  X[j]=%3.3E\n”,j,X[j]);
}
return 0;
}```

Example 2. Jacobi algorithm

```/* JACOBI program */

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define Max(a,b) ((a)>(b)?(a): (b))

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)

#define  L  8
#define  ITMAX 20

int i,j,it,k;
double eps;
double MAXEPS = 0.5;
FILE *f;

/* 2D arrays block distributed along 2 dimensions */
DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L];
DVM(ALIGN[i][j] WITH A[i][j]) double B[L][L];

int main(int argn, char **args)
{
/* 2D loop with base array A */
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,L-1,1)
DO(j,0,L-1,1)
{A[i][j]=0.;
B[i][j]=1.+i+j;
}

/******  iteration loop  *************************/
DO(it,1,ITMAX,1)
{
eps= 0.;
/* Parallel loop with base array A             */
/* calculating maximum in variable eps          */
DVM(PARALLEL [i][j] ON A[i][j]; REDUCTION MAX(eps))
DO(i,1,L-2,1)
DO(j,1,L-2,1)
{eps =  Max(fabs(B[i][j]-A[i][j]),eps);
A[i][j] = B[i][j];
}
/* Parallel loop with base array  B  and         */
/* with prior updating shadow elements of array A  */
DO(i,1,L-2,1)
DO(j,1,L-2,1)
B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4.;

printf(“it=%4i   eps=%3.3E\n”, it,eps);
if (eps < MAXEPS) break;
}/*DO it*/
f=fopen("jacobi.dat","wb");
fwrite(B,sizeof(double),L*L,f);
return 0;
}```

Example 3. Jacobi Algorithm (asynchronous version)

```/* Asynchronous JACOBI */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define Max(a,b) ((a)>(b)? (a): (b))

#define DVM(dvmdir)
#define DO(v,l,u,s)   for(v=l; v<=u; v+=s)

#define  L 8
#define  ITMAX 20

int i,j,it,k;
double eps;
double MAXEPS   = 0.5;
FILE *f;
/* declare groups for shadow and reduction operations */
DVM(REDUCTION_GROUP) void *emax;
/* 2D arrays block distributed along 2 dimensions */
DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L];
DVM(ALIGN [i][j] WITH A[i][j]) double B[L][L];

int main(int argn, char **args)
{
/* 2D parallel loop with base array A */
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,L-1,1)
DO(j,0,L-1,1)
{A[i][j]=0.;
B[i][j]=1.+i+j;}
/* Create group for shadow operation */

/************  iteration loop  *************************/
DO(it,1,ITMAX,1)
{
eps= 0.;
/* Parallel loop with base array A: */
/* at first elements of array A exported by the */
/* neighbor processor are calculated and sent and */
/* then internal elements of array A are calculated */
DVM(PARALLEL [i][j] ON A[i][j];
DO(i,1,L-2,1)
DO(j,1,L-2,1)
{eps = max(fabs(B[i][j]-A[i][j]),eps);
A[i][j] = B[i][j];
}
/* Start asynchronous calculation of maximum */
DVM(REDUCTION_START emax);
/* Parallel loop with base array B: */
/* internal elements of array B are calculated at first */
/* then completion of array A shadow edges update is */
/* awaited and the loop iterations, requiring shadow */
/* elements of array A are calculated */
DO(i,1,L-2,1)
DO(j,1,L-2,1)
B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4;
/* Wait for completion of reduction */
DVM(REDUCTION_WAIT emax);

printf( "it=%4i  eps=%3.3E\n”, it,eps);
if (eps < MAXEPS) break;
}/*DO it*/

f=fopen("jacobi.dat","wb");
fwrite(B,sizeof(double),L*L,f);
return 0;
}```

Example 4. Red-Black Successive Over-Relaxation

The idea of the method is to paint the points of the grid (the variables) in two colors - red and black - as on a chess-board. Non-rectangular index space is defined by the special form of a loop header: DO (var, a+b%2, ub, 2)

```/* RED-BLACK program */

#include <stdlib.h>
#include <math.h>

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)

#define Max(a,b) ((a)>(b)?(a):(b))
#define ITMAX 10

int main(int an, char ** as)
{long i, j, it, irb;
float eps=0.;
float MAXEPS = 0.5E-5;
float w = 0.5;

/* Groups of asynchronous operations */
DVM(REDUCTION_GROUP) void *rg;

int  N;

/* redefine N as constant */
#define     N 8
DVM(DISTRIBUTE [BLOCK] [BLOCK])  float (*A)[N];
#undef      N /* restore N */

/* "Calculate"  of array size (=8!). Create array */
N = 8;
A = malloc( N*N*sizeof(float));

/* Specification of members of shadow group  */

/* Initialization: parallel loop with surpassing */
/* calculation and sending of exported data */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha)
DO (i,0,N-1,1)
DO (j,0,N-1,1)
if(i==0 || i==N-1 || j==0 || j==N-1) A[i][j]=0.;
else A[i][j] = 1.+i+j;

/* iteration loop */
DO (it,0,ITMAX,1)
{
eps = 0.;

/* Parallel loop with reduction on RED points */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha;
REDUCTION rg: MAX(eps) )
DO (i,1,N-2,1)
DO (j,1+i%2,N-2,2)
{float s;
s = A[i][j];
A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+
A[i][j-1]+A[i][j+1])+(1-w)*A[i][j];
eps = Max (fabs(s-A[i][j]), eps);
}

/* Start red reduction -- as early as possible */
DVM(REDUCTION_START rg);

/* Parallel loop on BLACK points (without reduction) */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha)
DO (i,1,N-2,1)
DO (j,1+(i+1)%2,N-2,2)
{A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+
A[i][j-1]+A[i][j+1])+(1-w)*A[i][j];
}
/* wait red point's reduction -- as late as possible */
DVM(REDUCTION_WAIT rg);

printf("it=%d eps=%e\n",it,eps);
if (eps < MAXEPS) break;

} /* DO it */

free(A);
return 0;
}```
```/* MULTIGRID  program */

#include <stdio.h>
#include <stdlib.h>

#define DVM(dvm)
#define DO(v,l,h,s) for(v=(l); v<=(h); v+=(s))

void oper(
DVM(*DISTRIBUTE [*][*][*])  float *AA,
int AAN, int AAM, int AAK,
DVM(*DISTRIBUTE [*][*][*])  float *BB,
int BBN, int BBM, int BBK)
/*Parameters: two distributed 3D arrays and */
/* their  actual dimensions */
{
#define AA(i,j,k) AA[((i)*AAM+(j))*AAK+(k)]
#define BB(i,j,k) AA[((i)*BBM+(j))*BBK+(k)]
int i, j,k;
/* Alignment of array BB with array AA using stretching*/
DVM(REALIGN BB[i][j][k] WITH AA[2*i][2*j][2*k] NEW);
/* forming array BB from elements of array AA with even indexes */
DVM(PARALLEL [i][j][k] ON BB[i][j][k])
FOR(i,BBN)
FOR(j,BBM)
FOR(k,BBK)
BB(i,j,k)=AA(i*2,j*2,k*2);
#undef AA
#undef BB
}

int main(int argn, char **args)
{
int N1=8,M1=12,K1=16;
int N2,M2,K2;
int Narr=5,Nit=5;
int grid=0;
int ACM,ACK;
int i,j,k;
int step_grid=1;

/* Up to 20 distributed arrays */
DVM(DISTRIBUTE[BLOCK][BLOCK][]) float *A[20];
/* Pointer to current distributed array */
DVM(*DISTRIBUTE[*][*][*]) float *AC;
/* creation of array  A[0] */
A[0]=malloc(N1*M1*K1*sizeof(float));
AC=A[0];
ACM=M1;
ACK=K1;
#define AC(i,j,k) AC[((i)*ACM+(j))*ACK+(k)]
/* initialization of source array */
DVM(PARALLEL [i][j][k] ON AC[i][j][k])
DO(i,0,N1-1,1)
DO(j,0,M1-1,1)
DO(k,0,K1-1,1)
AC(i,j,k)=1.+i+j+k ;
#undef AC

do{
printf("N1=%d M1=%d K1=%d \n”,N1,M1,K1);
N2=N1/2;
M2=M1/2;
K2=K1/2;
grid++;
/* creation of array A[grid] */
A[grid]=malloc(N2*M2*K2*sizeof(float));
oper(A[grid-1],N1,M1,K1,A[grid],N2,M2,K2);
N1=N2;
M1=M2;
K1=K2;
} while (N2>2 && grid<Narr)

for(i=0;i<=grid;i++)
free(A[i]);
return 0;
}```

Example 6. Task Parallelism for Multiblock Code

```/*  PROGRAM TASKS                                 */
/* rectangular grid is distributed on two blocks  */
/*                                                */
/*                     <------- A2,B2 ----->      */
/*                     0    1   2    ... N2       */
/*       <--- A1,B1 -------->                     */
/*       0   1   ...  N1-1  N1  N1+1 ... N1+N2-1  */
/*------------------------------------------------*/
/* 0   | .   .         .    .   .    ...  .       */
/* ... |                                          */
/* K-1 | .   .         .    .   .    ...  .       */
/**************************************************/
#include <stdlib.h>
#include <math.h>

#define DVM(dvmdir)
#define DO(v,l,h,s)   for(v=l; v<=h; v+=s)
#define FOR(v,n)  for(v=0; v<n; v++)
#define Max(a,b) ((a)>(b)?(a):(b))

#define NUMBER_OF_PROCESSORS() 1

#define K   8
#define N1  4
#define N2  K-N1
#define ITMAX 10

DVM(PROCESSORS)  void * P[NUMBER_OF_PROCESSORS()];
double eps0;
DVM(DISTRIBUTE) float (*A1)[K], (*A2)[K];
DVM(ALIGN)      float (*B1)[K], (*B2)[K];

int main(int argn, char** args)
{
int i,j, it;
int NP;

printf("---------- starting --------------\n");

DVM(DEBUG 1 -d0)
{
NP = NUMBER_OF_PROCESSORS() / 2;
}

DVM(MAP MB[0] ONTO P[0:(NP?NP-1:0)]);
DVM(MAP MB[1] ONTO P[NP:(NP?2*NP-1:NP)]);

A1=malloc((N1+1)*K*sizeof(float));
DVM(REDISTRIBUTE A1[][BLOCK] ONTO MB[0]);
B1=malloc((N1+1)*K*sizeof(float));
DVM(REALIGN B1[i][j] WITH A1[i][j]);

A2=malloc((N2+1)*K*sizeof(float));
DVM(REDISTRIBUTE A2[][BLOCK] ONTO MB[1]);
B2=malloc((N2+1)*K*sizeof(float));
DVM(REALIGN B2[i][j] WITH A2[i][j]);

/* Initialization */
DVM(PARALLEL [i][j] ON A1[i][j])
FOR(i,N1+1)
FOR(j,K)
{if (i==0 || j==0 || j==K-1) {
A1[i][j] = 0.f;
B1[i][j] = 0.f;
} else  {
B1[i][j] = 1.f + i + j ;
A1[i][j] = B1[i][j];
}}
DVM(PARALLEL [i][j] ON A2[i][j])
FOR(i,N2+1)
FOR(j,K)
{if(i == N2  || j==0 || j==K-1) {
A2[i][j] = 0.f;
B2[i][j] = 0.f;
} else {
B2[i][j] = 1.f + ( i + N1 - 1 ) + j ;
A2[i][j] = B2[i][j];
}}

DO(it,1,ITMAX,1)
{
eps0 = 0.;

/*     exchange bounds */

DVM(PARALLEL [j] ON A1[(N1)][j]; REMOTE_ACCESS B2[1][])
FOR(j,K) A1[N1][j] = B2[1][j];

DVM(PARALLEL [j] ON A2[0][j]; REMOTE_ACCESS B1[N1-1][])
FOR(j,K) A2[0][j] = B1[N1-1][j];

{
DVM(ON MB[0])
{
double eps=0.; /* private reduction variable */
DVM(PARALLEL [i][j] ON B1[i][j]; SHADOW_RENEW A1[][])
DO(i,1, N1-1, 1)
DO(j,1, K-2, 1)
B1[i][j]=(A1[i-1][j]+A1[i][j-1]+A1[i+1][j]+A1[i][j+1])/4.f;

DVM(PARALLEL [i][j] ON A1[i][j]; REDUCTION MAX(eps))
DO(i,1,N1-1,1)
DO(j,1,K-2,1)
{eps =  Max(fabs(B1[i][j]-A1[i][j]),eps);
A1[i][j] = B1[i][j];
}
eps0=Max(eps0,eps);
}

DVM(ON MB[1])
{
double eps=0.; /* private reduction variable */
DVM(PARALLEL [i][j] ON B2[i][j]; SHADOW_RENEW A2[][])
DO(i,1,N2-1,1)
DO(j,1,K-2,1)
B2[i][j]=(A2[i-1][j]+A2[i][j-1]+A2[i+1][j]+A2[i][j+1])/4.f;

DVM(PARALLEL [i][j] ON A2[i][j]; REDUCTION MAX(eps))
DO(i,1,N2-1,1)
DO(j,1,K-2,1)
{eps =  Max(fabs(B2[i][j]-A2[i][j]),eps);
A2[i][j] = B2[i][j];
}
eps0=Max(eps0,eps);
}