= Ex43.c =
{{{#!C
/*
  Large Scale Computing
  Stokes Flow in a Cavity
  MPI version
  ex43.c
*/
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<time.h>
#include"mpi.h"
#include"stokeslet2d_dist.h"
 
#define EPSILON 0.005 /* blob size*/
#define INTGRID 51    /* # of x-grid lines for internal velocity */
 
/* min and max macros */
#define max(a,b)        (((a) > (b)) ? (a) : (b))
#define min(a,b)        (((a) < (b)) ? (a) : (b))
 
double t_cost(void );
int main(int , char **);
int main(int argc, char **argv){
  double dp;
  int numpdepth;
  int numpwidth;
  int numparticles; 
  double *loc; /* particle location */
  double *vel; /* particle velocity */
  double *foc; /* particle force */
  double *mat; /* 2Nx2N dense matrix */
  double cloc[2]; /* internal points */
  double cvel[2]; /* internal velocity */
  int mystart,myend,mysize;
  int *ostart,*oend;
  int myid,numproc;
  int i,j,li;
  int nyg,cpu;
  FILE *fp;
  char fn[256];
 
  /* Initialize */
  MPI_Init(&argc,&argv);
 
  /* get myid and # of processors */
  MPI_Comm_size(MPI_COMM_WORLD,&numproc);
  MPI_Comm_rank(MPI_COMM_WORLD,&myid);
 
  if (argc < 2){
    if (myid == 0) printf("Usage:%s [Depth of Cavity]\n",argv[0]);
    MPI_Abort(MPI_COMM_WORLD, -1);
  }
 
  /* get inputed depth */
  dp = atof(argv[1]);
 
  /* # of particles in depth */
  numpdepth = (int)(dp / EPSILON + 0.5);
 
  /* # of particles in width */
  numpwidth = (int)(1.0 / EPSILON + 0.5);
 
  /* total # od particles */
  numparticles = numpdepth * 2 + numpwidth * 2;
  if (myid == 0) printf("Total # of Particles=%d\n",numparticles);
 
  /* divide blobs into # of cpus */
  mystart = (numparticles / numproc) * myid;
  if (numparticles % numproc > myid){
    mystart += myid;
    myend = mystart + (numparticles / numproc) + 1;
  }else{
    mystart += numparticles % numproc;
    myend = mystart + (numparticles / numproc);
  }
  mysize = myend - mystart;
  printf("CPU%d %d ~ %d : size=%d\n",myid,mystart,myend,mysize);
 
  /* collect start end info */
  ostart = (int *)calloc(numproc, sizeof(int));
  oend = (int *)calloc(numproc, sizeof(int));
  MPI_Allgather(&mysize, 1, MPI_INT, oend, 1, MPI_INT, MPI_COMM_WORLD);  
  ostart[0] = 0;
  for (cpu = 1 ; cpu < numproc ; cpu++){
    ostart[cpu] = ostart[cpu-1] + oend[cpu-1];
    oend[cpu-1] += ostart[cpu-1];
  }
  oend[numproc-1] += ostart[numproc-1];
 
  /* Allocate Space */
  /* DIM=2 defined by 'stokeslet2d.h' */
  loc = (double *)calloc(mysize * DIM,sizeof(double));
  vel = (double *)calloc(mysize * DIM,sizeof(double));
  foc = (double *)calloc(mysize * DIM,sizeof(double));
  mat = (double *)calloc(mysize * DIM * numparticles * DIM,sizeof(double));
  if ((loc == (double *)NULL) ||
      (vel == (double *)NULL) || 
      (foc == (double *)NULL) || 
      (mat == (double *)NULL) ){
    printf("Can't allocate memory!!!\n");
    exit(-1);
  }
 
  /* Set location & velocity of particles (blob)*/
  for (i = mystart ; i < myend ; i++){
    li = i - mystart;
    foc[li * DIM] = 0.0;
    foc[li * DIM + 1] = 0.0;
    if ((i >= 0) && (i <= numpwidth)){ /* top */
      loc[li * DIM] = -0.5 + EPSILON * (double)i; /* x */
      loc[li * DIM + 1] = 0.0; /* y */
      vel[li * DIM] = 1.0;
      vel[li * DIM + 1] = 0.0;
    }else{
      if (i <= (numpwidth + numpdepth)){ /* right wall */      
	loc[li * DIM] = 0.5; /* x */
	loc[li * DIM + 1] = -EPSILON * (double)(i - numpwidth); /* y */	
	vel[li * DIM] = 0.0;
	vel[li * DIM + 1] = 0.0;
      }else{
	if (i <= (2 * numpwidth + numpdepth)){ /* bottom */
	  loc[li * DIM] = 0.5 - EPSILON * (double)(i - (numpwidth + numpdepth)); /* x */
	  loc[li * DIM + 1] = -EPSILON * numpdepth; /* y */
	  vel[li * DIM] = 0.0;
	  vel[li * DIM + 1] = 0.0;	  
	}else{                 /* left wall */    
	  loc[li * DIM] = -0.5; /* x */
	  loc[li * DIM + 1] = -EPSILON * (double)((2 * numpwidth + 2 * numpdepth) - i); /* y */	
	  vel[li * DIM] = 0.0;
	  vel[li * DIM + 1] = 0.0;	  
	}
      }
    }
  }
 
 
  /* make 2Nx2N Matrix */
  t_cost();
  slet2d_mkMatrix_dist(numparticles,ostart,oend,loc,EPSILON,mat);
  if (myid == 0) printf("setting matrix %f sec\n",t_cost());
 
  /* Sovle linear ststem */
  t_cost();
  slet2d_solve_CG_dist(numparticles,ostart,oend,mat,vel,foc);
  if (myid == 0) printf("Sovle linear ststem %f sec\n",t_cost());
 
  /* deallocate big matrix */
  free(mat);
 
  /* out particle (blob) data */
  sprintf(fn,"particle%02d.dat",myid);
  fp = fopen(fn,"w");
  for (i = 0 ; i < mysize ; i++){
    fprintf(fp,"%e %e %e %e %e %e\n",
	    loc[i * DIM],loc[i * DIM + 1],
	    vel[i * DIM],vel[i * DIM + 1],
	    foc[i * DIM],foc[i * DIM + 1]);
  }
  fclose(fp);
 
  /* 
     compute internal velocity 
  */
  t_cost();
  if (myid == 0){
    fp = fopen("res.dat","w");
  }
  nyg = (int)(EPSILON * (double)(numpdepth * (INTGRID - 1)));
  for (j = 0 ; j < nyg ; j++){
    for (i = 0 ; i < INTGRID ; i++){
      cloc[0] = -0.5 + (double)i / (double)(INTGRID - 1);
      cloc[1] = -(double)j / (double)(INTGRID - 1);
      slet2d_velocity_dist(numparticles, ostart,oend,loc, foc, EPSILON, cloc, cvel);        
      if (myid == 0){
	fprintf(fp,"%e %e %e %e\n",cloc[0], cloc[1],cvel[0], cvel[1]);      
      }
    }
  }
  if (myid == 0){
    fclose(fp);  
  }
  if (myid == 0)printf("Compute internal velocity %f sec\n",t_cost());
 
 
  /* free */
  free(loc);
  free(vel);
  free(foc);
 
  printf("CPU%d is Done\n",myid);
 
  MPI_Finalize();
}
 
/* for timing */
double t_cost(void ){
  static clock_t ptm;
  static int fl = 0;
  clock_t tm;
  double sec;
 
  tm = clock();
  if (fl == 0) ptm = tm;
  fl = 1;
 
  sec = (double)(tm - ptm) / (double)CLOCKS_PER_SEC;
 
  ptm = tm;
 
  return sec;
}
stokeslet2d_dist.c

/*
  Large Scale Computing
  Stokeslet for 2D
  MPI Version
 
  Ricardo Cortez,"The Method of Regularized Stokeslets", 
  (2001), SIAM J. Sci. Comput., Vol.23, No.4, pp.1204
 
  assuming mu = 1
*/
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include"mpi.h"
#include"stokeslet2d_dist.h"
 
/* min and max macros */
#define max(a,b)        (((a) > (b)) ? (a) : (b))
#define min(a,b)        (((a) < (b)) ? (a) : (b))
 
double term1(double ,double );
double term2(double ,double );
/* Blas Function */
double ddot_(int *,double *,int *,double *,int *);
double dnrm2_(int *, double *, int *);
 
/* make Matrix */
void slet2d_mkMatrix_dist(int np,int *ostart,int *oend,double *loc,double ep,double *mat){
  int i,j;
  int li,lj;
  double r;
  double dx,dy;
  double tr1,tr2;
  int mysize,maxsize;
  int stend[2];
  int myid,numproc;
  double *oloc,*ploc;
  int cpu;
 
  /* get myid and # of processors */
  MPI_Comm_size(MPI_COMM_WORLD,&numproc);
  MPI_Comm_rank(MPI_COMM_WORLD,&myid);
 
  /* get max size */
  maxsize = 0;
  for (cpu = 0 ; cpu < numproc ; cpu++){
    maxsize = max(oend[cpu] - ostart[cpu],maxsize);
  }
  mysize = oend[myid] - ostart[myid];
 
  /* alloc */
  oloc = (double *)calloc(maxsize * DIM,sizeof(double));
 
  /*zeros mat*/
  for (i = 0 ; i < DIM * DIM * np * mysize ; i++) mat[i] = 0.0;
 
  /* make mat */
  for (cpu = 0 ; cpu < numproc ; cpu++){
    /* get location vector from others */
    ploc = oloc;
    if (myid == cpu) ploc = loc; 
    MPI_Bcast(ploc, DIM * (oend[cpu] - ostart[cpu]), MPI_DOUBLE, cpu, MPI_COMM_WORLD);
 
    /* compute local matrix*/
    for (i = ostart[cpu] ; i < oend[cpu] ; i++){    
      for (j = ostart[myid] ; j < oend[myid] ; j++){
	li = i - ostart[cpu];
	lj = j - ostart[myid];
	dx = ploc[li * DIM    ] - loc[lj * DIM    ];
	dy = ploc[li * DIM + 1] - loc[lj * DIM + 1];
	r = sqrt(dx * dx + dy * dy);
 
	tr1 = term1(r,ep) / (4.0 * M_PI);
	tr2 = term2(r,ep) / (4.0 * M_PI); 
 
	mat[i * DIM +      lj * DIM      * np * DIM] += -tr1 + tr2 * dx * dx;
	mat[i * DIM +     (lj * DIM + 1) * np * DIM] +=        tr2 * dx * dy;
	mat[i * DIM + 1 +  lj * DIM      * np * DIM] +=        tr2 * dx * dy;
	mat[i * DIM + 1 + (lj * DIM + 1) * np * DIM] += -tr1 + tr2 * dy * dy;      
      }
    }
  }
 
  free(oloc);
}
 
/* Sovle Liear ststem */
/* CG */
void slet2d_solve_CG_dist(int np, int *ostart, int *oend, double *mat, double *b, double *x){
  double tol = 1.0e-8;
  double *r;
  double *p;
  double *Ap;
  double *work;
  double rr,pAp,rr1,alpha,beta;
  int i,j,count;
  double w,normb;
  int n;
  int ms,me;
  int cpu;
  int myid,numproc;
  int li,lj;
 
  /* get myid and # of processors */
  MPI_Comm_size(MPI_COMM_WORLD,&numproc);
  MPI_Comm_rank(MPI_COMM_WORLD,&myid);
 
  n = DIM * np;
  ms = DIM * ostart[myid];
  me = DIM * oend[myid];
 
  /* Allocate Working Spaces */
  r = (double *)calloc(me - ms, sizeof(double));
  p = (double *)calloc(me - ms, sizeof(double));
  Ap = (double *)calloc(me - ms, sizeof(double));
  work = (double *)calloc(n, sizeof(double));
  if ((r == NULL) || 
      (p == NULL) || 
      (Ap == NULL) ||
      (work == NULL)){
    printf("memory allocation failed\n");
    return ;
  }
 
  /* initialize */
  for (i = 0 ; i < me - ms ; i++){
    r[i] = p[i] = Ap[i] = 0.0;
  }
 
  /* |b| */
  w = 0.0;
  for (i = 0 ; i < me - ms ; i++){
    w += b[i] * b[i]; /* w = b.b */ 
  }
  MPI_Allreduce(&w, &normb, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);  
  normb = sqrt(normb);
 
  /* compute r0 = b - Ax */
  for (i = 0 ; i < n ; i++) work[i] = 0.0;
  for (i = ms ; i < me ; i++){
    li = i - ms;
    work[i] = b[li];
  }
  for (j = ms ; j < me ; j++){
    lj = j - ms;
    for (i = 0 ; i < n ; i++){
      work[i] -= mat[i + lj * n] * x[lj];
    }
  }
  for (cpu = 0 ; cpu < numproc ; cpu++){
    MPI_Reduce(&work[DIM * ostart[cpu]], r, DIM * (oend[cpu] - ostart[cpu]), 
 	       MPI_DOUBLE, MPI_SUM, cpu, MPI_COMM_WORLD); 
  }
 
  w = 0.0;
  for (i = 0 ; i < me - ms ; i++){
    p[i] = r[i];  /* p = r */
    w += r[i] * r[i]; /* rr = r.r */ 
  }
  MPI_Allreduce(&w, &rr, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
  /* cg iteration */
  count = 0;
  while(rr  > tol * tol * normb * normb){    
    /* Ap = A*p */
    for (i = 0 ; i < n ; i++) work[i] = 0.0;
    for (j = ms ; j < me ; j++){
      lj = j - ms;
      for (i = 0 ; i < n ; i++){
	work[i] += mat[i + lj * n] * p[lj];
      }
    }
    for (cpu = 0 ; cpu < numproc ; cpu++){
      MPI_Reduce(&work[DIM * ostart[cpu]], Ap, DIM * (oend[cpu] - ostart[cpu]),
		 MPI_DOUBLE, MPI_SUM, cpu, MPI_COMM_WORLD);
    }
 
    /* pAp = p.Ap */ 
    w = 0.0;
    for (i = 0 ; i < me - ms ; i++){
      w += p[i] * Ap[i];
    }
    MPI_Allreduce(&w, &pAp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
    /* alpha = r.r / p.Ap */
    alpha = rr / pAp;
 
    /* Beta */
    for (i = 0 ; i < me - ms ; i++){
      x[i] += alpha * p[i];  /* x += alpha * p */
      r[i] -= alpha * Ap[i]; /* r -= alpha * Ap */
    }    
 
    /* rr1 = r.r */     
    w = 0.0;
    for (i = 0 ; i < me - ms ; i++){
      w += r[i] * r[i];
    }
    MPI_Allreduce(&w, &rr1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
    beta = rr1 / rr;
 
    /* p = r + beta * p */
    for (i = 0 ; i < me - ms ; i++){
      p[i] = r[i] + beta * p[i];
    }
 
    rr = rr1;
    count++;
  }
  if (myid == 0) printf("count=%d\n",count);
 
  /* Deallocate Working Spaces */
  free(r); 
  free(p); 
  free(Ap);
  free(work);
}
 
 
/* compute velocity */
void slet2d_velocity_dist(int np, int *ostart,int *oend,double *loc, double *foc, double ep, 
			  double *cloc,double *cvel){
  int i,p;
  double r;
  double dx,dy;
  double tr1,tr2;
  double ov[2];
  int myid,numproc;
  int li;
 
  /* get myid and # of processors */
  MPI_Comm_size(MPI_COMM_WORLD,&numproc);
  MPI_Comm_rank(MPI_COMM_WORLD,&myid);
 
  /* zeros */
  ov[0] = 0.0;
  ov[1] = 0.0;
 
  /* loop for partilces (blob) */
  for (i = ostart[myid] ; i < oend[myid] ; i++){
    li = i - ostart[myid];
    dx = cloc[0] - loc[li * DIM    ];
    dy = cloc[1] - loc[li * DIM + 1];
    r = sqrt(dx * dx + dy * dy);
 
    tr1 = term1(r,ep) / (4.0 * M_PI);
    tr2 = term2(r,ep) / (4.0 * M_PI);       
 
    tr2 *= foc[li * DIM] * dx + foc[li * DIM + 1] * dy; 
 
    ov[0] += -foc[li * DIM    ] * tr1 + tr2 * dx;
    ov[1] += -foc[li * DIM + 1] * tr1 + tr2 * dy;
  }
 
  /* sum up to CPU0 */
  MPI_Reduce(ov, cvel, 2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);  
}
 
double term1(double r,double ep){
  double sq;
 
  sq = sqrt(r * r + ep * ep);
 
  return log(sq + ep) - ep * (sq + 2.0 * ep) / (sq + ep) / sq;
}
 
 
double term2(double r,double ep){
  double sq;
 
  sq = sqrt(r * r + ep * ep);
 
  return (sq + 2.0 * ep) / (sq + ep) / (sq + ep) / sq;
}
}}}