= Ex43.c = {{{#!C /* Large Scale Computing Stokes Flow in a Cavity MPI version ex43.c */ #include #include #include #include #include"mpi.h" #include"stokeslet2d_dist.h" #define EPSILON 0.005 /* blob size*/ #define INTGRID 51 /* # of x-grid lines for internal velocity */ /* min and max macros */ #define max(a,b) (((a) > (b)) ? (a) : (b)) #define min(a,b) (((a) < (b)) ? (a) : (b)) double t_cost(void ); int main(int , char **); int main(int argc, char **argv){ double dp; int numpdepth; int numpwidth; int numparticles; double *loc; /* particle location */ double *vel; /* particle velocity */ double *foc; /* particle force */ double *mat; /* 2Nx2N dense matrix */ double cloc[2]; /* internal points */ double cvel[2]; /* internal velocity */ int mystart,myend,mysize; int *ostart,*oend; int myid,numproc; int i,j,li; int nyg,cpu; FILE *fp; char fn[256]; /* Initialize */ MPI_Init(&argc,&argv); /* get myid and # of processors */ MPI_Comm_size(MPI_COMM_WORLD,&numproc); MPI_Comm_rank(MPI_COMM_WORLD,&myid); if (argc < 2){ if (myid == 0) printf("Usage:%s [Depth of Cavity]\n",argv[0]); MPI_Abort(MPI_COMM_WORLD, -1); } /* get inputed depth */ dp = atof(argv[1]); /* # of particles in depth */ numpdepth = (int)(dp / EPSILON + 0.5); /* # of particles in width */ numpwidth = (int)(1.0 / EPSILON + 0.5); /* total # od particles */ numparticles = numpdepth * 2 + numpwidth * 2; if (myid == 0) printf("Total # of Particles=%d\n",numparticles); /* divide blobs into # of cpus */ mystart = (numparticles / numproc) * myid; if (numparticles % numproc > myid){ mystart += myid; myend = mystart + (numparticles / numproc) + 1; }else{ mystart += numparticles % numproc; myend = mystart + (numparticles / numproc); } mysize = myend - mystart; printf("CPU%d %d ~ %d : size=%d\n",myid,mystart,myend,mysize); /* collect start end info */ ostart = (int *)calloc(numproc, sizeof(int)); oend = (int *)calloc(numproc, sizeof(int)); MPI_Allgather(&mysize, 1, MPI_INT, oend, 1, MPI_INT, MPI_COMM_WORLD); ostart[0] = 0; for (cpu = 1 ; cpu < numproc ; cpu++){ ostart[cpu] = ostart[cpu-1] + oend[cpu-1]; oend[cpu-1] += ostart[cpu-1]; } oend[numproc-1] += ostart[numproc-1]; /* Allocate Space */ /* DIM=2 defined by 'stokeslet2d.h' */ loc = (double *)calloc(mysize * DIM,sizeof(double)); vel = (double *)calloc(mysize * DIM,sizeof(double)); foc = (double *)calloc(mysize * DIM,sizeof(double)); mat = (double *)calloc(mysize * DIM * numparticles * DIM,sizeof(double)); if ((loc == (double *)NULL) || (vel == (double *)NULL) || (foc == (double *)NULL) || (mat == (double *)NULL) ){ printf("Can't allocate memory!!!\n"); exit(-1); } /* Set location & velocity of particles (blob)*/ for (i = mystart ; i < myend ; i++){ li = i - mystart; foc[li * DIM] = 0.0; foc[li * DIM + 1] = 0.0; if ((i >= 0) && (i <= numpwidth)){ /* top */ loc[li * DIM] = -0.5 + EPSILON * (double)i; /* x */ loc[li * DIM + 1] = 0.0; /* y */ vel[li * DIM] = 1.0; vel[li * DIM + 1] = 0.0; }else{ if (i <= (numpwidth + numpdepth)){ /* right wall */ loc[li * DIM] = 0.5; /* x */ loc[li * DIM + 1] = -EPSILON * (double)(i - numpwidth); /* y */ vel[li * DIM] = 0.0; vel[li * DIM + 1] = 0.0; }else{ if (i <= (2 * numpwidth + numpdepth)){ /* bottom */ loc[li * DIM] = 0.5 - EPSILON * (double)(i - (numpwidth + numpdepth)); /* x */ loc[li * DIM + 1] = -EPSILON * numpdepth; /* y */ vel[li * DIM] = 0.0; vel[li * DIM + 1] = 0.0; }else{ /* left wall */ loc[li * DIM] = -0.5; /* x */ loc[li * DIM + 1] = -EPSILON * (double)((2 * numpwidth + 2 * numpdepth) - i); /* y */ vel[li * DIM] = 0.0; vel[li * DIM + 1] = 0.0; } } } } /* make 2Nx2N Matrix */ t_cost(); slet2d_mkMatrix_dist(numparticles,ostart,oend,loc,EPSILON,mat); if (myid == 0) printf("setting matrix %f sec\n",t_cost()); /* Sovle linear ststem */ t_cost(); slet2d_solve_CG_dist(numparticles,ostart,oend,mat,vel,foc); if (myid == 0) printf("Sovle linear ststem %f sec\n",t_cost()); /* deallocate big matrix */ free(mat); /* out particle (blob) data */ sprintf(fn,"particle%02d.dat",myid); fp = fopen(fn,"w"); for (i = 0 ; i < mysize ; i++){ fprintf(fp,"%e %e %e %e %e %e\n", loc[i * DIM],loc[i * DIM + 1], vel[i * DIM],vel[i * DIM + 1], foc[i * DIM],foc[i * DIM + 1]); } fclose(fp); /* compute internal velocity */ t_cost(); if (myid == 0){ fp = fopen("res.dat","w"); } nyg = (int)(EPSILON * (double)(numpdepth * (INTGRID - 1))); for (j = 0 ; j < nyg ; j++){ for (i = 0 ; i < INTGRID ; i++){ cloc[0] = -0.5 + (double)i / (double)(INTGRID - 1); cloc[1] = -(double)j / (double)(INTGRID - 1); slet2d_velocity_dist(numparticles, ostart,oend,loc, foc, EPSILON, cloc, cvel); if (myid == 0){ fprintf(fp,"%e %e %e %e\n",cloc[0], cloc[1],cvel[0], cvel[1]); } } } if (myid == 0){ fclose(fp); } if (myid == 0)printf("Compute internal velocity %f sec\n",t_cost()); /* free */ free(loc); free(vel); free(foc); printf("CPU%d is Done\n",myid); MPI_Finalize(); } /* for timing */ double t_cost(void ){ static clock_t ptm; static int fl = 0; clock_t tm; double sec; tm = clock(); if (fl == 0) ptm = tm; fl = 1; sec = (double)(tm - ptm) / (double)CLOCKS_PER_SEC; ptm = tm; return sec; } stokeslet2d_dist.c /* Large Scale Computing Stokeslet for 2D MPI Version Ricardo Cortez,"The Method of Regularized Stokeslets", (2001), SIAM J. Sci. Comput., Vol.23, No.4, pp.1204 assuming mu = 1 */ #include #include #include #include"mpi.h" #include"stokeslet2d_dist.h" /* min and max macros */ #define max(a,b) (((a) > (b)) ? (a) : (b)) #define min(a,b) (((a) < (b)) ? (a) : (b)) double term1(double ,double ); double term2(double ,double ); /* Blas Function */ double ddot_(int *,double *,int *,double *,int *); double dnrm2_(int *, double *, int *); /* make Matrix */ void slet2d_mkMatrix_dist(int np,int *ostart,int *oend,double *loc,double ep,double *mat){ int i,j; int li,lj; double r; double dx,dy; double tr1,tr2; int mysize,maxsize; int stend[2]; int myid,numproc; double *oloc,*ploc; int cpu; /* get myid and # of processors */ MPI_Comm_size(MPI_COMM_WORLD,&numproc); MPI_Comm_rank(MPI_COMM_WORLD,&myid); /* get max size */ maxsize = 0; for (cpu = 0 ; cpu < numproc ; cpu++){ maxsize = max(oend[cpu] - ostart[cpu],maxsize); } mysize = oend[myid] - ostart[myid]; /* alloc */ oloc = (double *)calloc(maxsize * DIM,sizeof(double)); /*zeros mat*/ for (i = 0 ; i < DIM * DIM * np * mysize ; i++) mat[i] = 0.0; /* make mat */ for (cpu = 0 ; cpu < numproc ; cpu++){ /* get location vector from others */ ploc = oloc; if (myid == cpu) ploc = loc; MPI_Bcast(ploc, DIM * (oend[cpu] - ostart[cpu]), MPI_DOUBLE, cpu, MPI_COMM_WORLD); /* compute local matrix*/ for (i = ostart[cpu] ; i < oend[cpu] ; i++){ for (j = ostart[myid] ; j < oend[myid] ; j++){ li = i - ostart[cpu]; lj = j - ostart[myid]; dx = ploc[li * DIM ] - loc[lj * DIM ]; dy = ploc[li * DIM + 1] - loc[lj * DIM + 1]; r = sqrt(dx * dx + dy * dy); tr1 = term1(r,ep) / (4.0 * M_PI); tr2 = term2(r,ep) / (4.0 * M_PI); mat[i * DIM + lj * DIM * np * DIM] += -tr1 + tr2 * dx * dx; mat[i * DIM + (lj * DIM + 1) * np * DIM] += tr2 * dx * dy; mat[i * DIM + 1 + lj * DIM * np * DIM] += tr2 * dx * dy; mat[i * DIM + 1 + (lj * DIM + 1) * np * DIM] += -tr1 + tr2 * dy * dy; } } } free(oloc); } /* Sovle Liear ststem */ /* CG */ void slet2d_solve_CG_dist(int np, int *ostart, int *oend, double *mat, double *b, double *x){ double tol = 1.0e-8; double *r; double *p; double *Ap; double *work; double rr,pAp,rr1,alpha,beta; int i,j,count; double w,normb; int n; int ms,me; int cpu; int myid,numproc; int li,lj; /* get myid and # of processors */ MPI_Comm_size(MPI_COMM_WORLD,&numproc); MPI_Comm_rank(MPI_COMM_WORLD,&myid); n = DIM * np; ms = DIM * ostart[myid]; me = DIM * oend[myid]; /* Allocate Working Spaces */ r = (double *)calloc(me - ms, sizeof(double)); p = (double *)calloc(me - ms, sizeof(double)); Ap = (double *)calloc(me - ms, sizeof(double)); work = (double *)calloc(n, sizeof(double)); if ((r == NULL) || (p == NULL) || (Ap == NULL) || (work == NULL)){ printf("memory allocation failed\n"); return ; } /* initialize */ for (i = 0 ; i < me - ms ; i++){ r[i] = p[i] = Ap[i] = 0.0; } /* |b| */ w = 0.0; for (i = 0 ; i < me - ms ; i++){ w += b[i] * b[i]; /* w = b.b */ } MPI_Allreduce(&w, &normb, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); normb = sqrt(normb); /* compute r0 = b - Ax */ for (i = 0 ; i < n ; i++) work[i] = 0.0; for (i = ms ; i < me ; i++){ li = i - ms; work[i] = b[li]; } for (j = ms ; j < me ; j++){ lj = j - ms; for (i = 0 ; i < n ; i++){ work[i] -= mat[i + lj * n] * x[lj]; } } for (cpu = 0 ; cpu < numproc ; cpu++){ MPI_Reduce(&work[DIM * ostart[cpu]], r, DIM * (oend[cpu] - ostart[cpu]), MPI_DOUBLE, MPI_SUM, cpu, MPI_COMM_WORLD); } w = 0.0; for (i = 0 ; i < me - ms ; i++){ p[i] = r[i]; /* p = r */ w += r[i] * r[i]; /* rr = r.r */ } MPI_Allreduce(&w, &rr, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); /* cg iteration */ count = 0; while(rr > tol * tol * normb * normb){ /* Ap = A*p */ for (i = 0 ; i < n ; i++) work[i] = 0.0; for (j = ms ; j < me ; j++){ lj = j - ms; for (i = 0 ; i < n ; i++){ work[i] += mat[i + lj * n] * p[lj]; } } for (cpu = 0 ; cpu < numproc ; cpu++){ MPI_Reduce(&work[DIM * ostart[cpu]], Ap, DIM * (oend[cpu] - ostart[cpu]), MPI_DOUBLE, MPI_SUM, cpu, MPI_COMM_WORLD); } /* pAp = p.Ap */ w = 0.0; for (i = 0 ; i < me - ms ; i++){ w += p[i] * Ap[i]; } MPI_Allreduce(&w, &pAp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); /* alpha = r.r / p.Ap */ alpha = rr / pAp; /* Beta */ for (i = 0 ; i < me - ms ; i++){ x[i] += alpha * p[i]; /* x += alpha * p */ r[i] -= alpha * Ap[i]; /* r -= alpha * Ap */ } /* rr1 = r.r */ w = 0.0; for (i = 0 ; i < me - ms ; i++){ w += r[i] * r[i]; } MPI_Allreduce(&w, &rr1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); beta = rr1 / rr; /* p = r + beta * p */ for (i = 0 ; i < me - ms ; i++){ p[i] = r[i] + beta * p[i]; } rr = rr1; count++; } if (myid == 0) printf("count=%d\n",count); /* Deallocate Working Spaces */ free(r); free(p); free(Ap); free(work); } /* compute velocity */ void slet2d_velocity_dist(int np, int *ostart,int *oend,double *loc, double *foc, double ep, double *cloc,double *cvel){ int i,p; double r; double dx,dy; double tr1,tr2; double ov[2]; int myid,numproc; int li; /* get myid and # of processors */ MPI_Comm_size(MPI_COMM_WORLD,&numproc); MPI_Comm_rank(MPI_COMM_WORLD,&myid); /* zeros */ ov[0] = 0.0; ov[1] = 0.0; /* loop for partilces (blob) */ for (i = ostart[myid] ; i < oend[myid] ; i++){ li = i - ostart[myid]; dx = cloc[0] - loc[li * DIM ]; dy = cloc[1] - loc[li * DIM + 1]; r = sqrt(dx * dx + dy * dy); tr1 = term1(r,ep) / (4.0 * M_PI); tr2 = term2(r,ep) / (4.0 * M_PI); tr2 *= foc[li * DIM] * dx + foc[li * DIM + 1] * dy; ov[0] += -foc[li * DIM ] * tr1 + tr2 * dx; ov[1] += -foc[li * DIM + 1] * tr1 + tr2 * dy; } /* sum up to CPU0 */ MPI_Reduce(ov, cvel, 2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); } double term1(double r,double ep){ double sq; sq = sqrt(r * r + ep * ep); return log(sq + ep) - ep * (sq + 2.0 * ep) / (sq + ep) / sq; } double term2(double r,double ep){ double sq; sq = sqrt(r * r + ep * ep); return (sq + 2.0 * ep) / (sq + ep) / (sq + ep) / sq; } }}}