Actual source code: vpscat.c

  1: /*$Id: vpscat.c,v 1.165 2001/09/08 13:16:48 bsmith Exp $*/
  2: /*
  3:     Defines parallel vector scatters.
  4: */

 6:  #include "src/vec/is/isimpl.h"
 7:  #include "src/vec/vecimpl.h"
 8:  #include "src/vec/impls/dvecimpl.h"
 9:  #include "src/vec/impls/mpi/pvecimpl.h"
 10:  #include "petscsys.h"

 12: int VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 13: {
 14:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 15:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 16:   int                    i,rank,ierr;
 17:   PetscViewerFormat      format;
 18:   PetscTruth             isascii;

 21:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);
 22:   if (isascii) {
 23:     MPI_Comm_rank(ctx->comm,&rank);
 24:     PetscViewerGetFormat(viewer,&format);
 25:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 26:       int nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 28:       MPI_Reduce(&to->n,&nsend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 29:       MPI_Reduce(&from->n,&nrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 30:       itmp = to->starts[to->n];
 31:       MPI_Reduce(&itmp,&lensend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 32:       itmp = from->starts[from->n];
 33:       MPI_Reduce(&itmp,&lenrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 34:       MPI_Reduce(&itmp,&alldata,1,MPI_INT,MPI_SUM,0,ctx->comm);

 36:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 37:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %d\n",nsend_max);
 38:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %d\n",nrecv_max);
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %d\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
 40:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %d\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 41:       PetscViewerASCIIPrintf(viewer,"  Total data sent %d\n",(int)(alldata*to->bs*sizeof(PetscScalar)));

 43:     } else {
 44:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %d; Number to self = %d\n",rank,to->n,to->local.n);
 45:       if (to->n) {
 46:         for (i=0; i<to->n; i++){
 47:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %d length = %d to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 48:         }
 49:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 50:         for (i=0; i<to->starts[to->n]; i++){
 51:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,to->indices[i]);
 52:         }
 53:       }

 55:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %d; Number from self = %d\n",rank,from->n,from->local.n);
 56:       if (from->n) {
 57:         for (i=0; i<from->n; i++){
 58:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d length %d from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 59:         }

 61:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 62:         for (i=0; i<from->starts[from->n]; i++){
 63:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,from->indices[i]);
 64:         }
 65:       }
 66:       if (to->local.n) {
 67:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
 68:         for (i=0; i<to->local.n; i++){
 69:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %d to %d \n",rank,from->local.slots[i],to->local.slots[i]);
 70:         }
 71:       }

 73:       PetscViewerFlush(viewer);
 74:     }
 75:   } else {
 76:     SETERRQ1(1,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 77:   }
 78:   return(0);
 79: }

 81: /* -----------------------------------------------------------------------------------*/
 82: /*
 83:       The next routine determines which part of the local scatter is an exact
 84:   copy of values into their current locations. We check this here so that
 85:   portion of the scatter can be skipped later.
 86: */
 87: int VecScatterLocalOptimize_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from)
 88: {
 89:   int n = gen_to->n,n_nonmatching = 0,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
 90:   int *nto_slots,*nfrom_slots,j = 0,ierr;
 91: 
 93:   for (i=0; i<n; i++) {
 94:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
 95:   }

 97:   if (!n_nonmatching) {
 98:     gen_to->nonmatching_computed = PETSC_TRUE;
 99:     gen_to->n_nonmatching        = gen_from->n_nonmatching = 0;
100:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to 0\n", n);
101:   } else if (n_nonmatching == n) {
102:     gen_to->nonmatching_computed = PETSC_FALSE;
103:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:All values non-matching\n");
104:   } else {
105:     gen_to->nonmatching_computed= PETSC_TRUE;
106:     gen_to->n_nonmatching       = gen_from->n_nonmatching = n_nonmatching;
107:     PetscMalloc(n_nonmatching*sizeof(int),&nto_slots);
108:     gen_to->slots_nonmatching   = nto_slots;
109:     PetscMalloc(n_nonmatching*sizeof(int),&nfrom_slots);
110:     gen_from->slots_nonmatching = nfrom_slots;
111:     for (i=0; i<n; i++) {
112:       if (to_slots[i] != from_slots[i]) {
113:         nto_slots[j]   = to_slots[i];
114:         nfrom_slots[j] = from_slots[i];
115:         j++;
116:       }
117:     }
118:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to %d\n",n,n_nonmatching);
119:   }
120:   return(0);
121: }
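
/*
    A worked illustration (not from the library) of what the routine above
    computes: suppose the local part of a scatter has

        to_slots   = {3, 7, 9, 12}      (read locations in xv)
        from_slots = {3, 7, 8, 11}      (write locations in yv)

    The first two entries copy values onto themselves when yv == xv, so the
    routine records n_nonmatching = 2 and builds the compressed arrays

        gen_to->slots_nonmatching   = {9, 12}
        gen_from->slots_nonmatching = {8, 11}

    VecScatterBegin_PtoP() below then calls, for a same-vector scatter,

        VecScatterLocalOptimize_Private(&gen_to->local,&gen_from->local);

    and afterwards loops only over the two entries that actually move.
*/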

123: /* --------------------------------------------------------------------------------------*/
124: int VecScatterCopy_PtoP(VecScatter in,VecScatter out)
125: {
126:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
127:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
128:   int                    len,ny,ierr;

131:   out->postrecvs = in->postrecvs;
132:   out->begin     = in->begin;
133:   out->end       = in->end;
134:   out->copy      = in->copy;
135:   out->destroy   = in->destroy;
136:   out->view      = in->view;

138:   /* allocate entire send scatter context */
139:   PetscNew(VecScatter_MPI_General,&out_to);
140:   PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
141:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
142:   ny                = in_to->starts[in_to->n];
143:   len               = ny*(sizeof(int) + sizeof(PetscScalar)) + (in_to->n+1)*sizeof(int) +
144:                      (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
145:   out_to->n         = in_to->n;
146:   out_to->type      = in_to->type;
147:   out_to->sendfirst = in_to->sendfirst;
148:   PetscMalloc(len,&out_to->values);
149:   PetscLogObjectMemory(out,len);
150:   out_to->requests  = (MPI_Request*)(out_to->values + ny);
151:   out_to->indices   = (int*)(out_to->requests + out_to->n);
152:   out_to->starts    = (int*)(out_to->indices + ny);
153:   out_to->procs     = (int*)(out_to->starts + out_to->n + 1);
154:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
155:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
156:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
157:   PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
158:   out_to->rstatus = out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
159:   PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n) + 1)*sizeof(MPI_Status));
160:   out->todata      = (void*)out_to;
161:   out_to->local.n  = in_to->local.n;
162:   out_to->local.nonmatching_computed = PETSC_FALSE;
163:   out_to->local.n_nonmatching        = 0;
164:   out_to->local.slots_nonmatching    = 0;
165:   if (in_to->local.n) {
166:     PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
167:     PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
168:     PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
169:   } else {
170:     out_to->local.slots = 0;
171:   }

173:   /* allocate entire receive context */
174:   PetscNew(VecScatter_MPI_General,&out_from);
175:   PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
176:   out_from->type      = in_from->type;
177:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
178:   ny                  = in_from->starts[in_from->n];
179:   len                 = ny*(sizeof(int) + sizeof(PetscScalar)) + (in_from->n+1)*sizeof(int) +
180:                        (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
181:   out_from->n         = in_from->n;
182:   out_from->sendfirst = in_from->sendfirst;
183:   PetscMalloc(len,&out_from->values);
184:   PetscLogObjectMemory(out,len);
185:   out_from->requests  = (MPI_Request*)(out_from->values + ny);
186:   out_from->indices   = (int*)(out_from->requests + out_from->n);
187:   out_from->starts    = (int*)(out_from->indices + ny);
188:   out_from->procs     = (int*)(out_from->starts + out_from->n + 1);
189:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
190:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
191:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
192:   out->fromdata       = (void*)out_from;
193:   out_from->local.n   = in_from->local.n;
194:   out_from->local.nonmatching_computed = PETSC_FALSE;
195:   out_from->local.n_nonmatching        = 0;
196:   out_from->local.slots_nonmatching    = 0;
197:   if (in_from->local.n) {
198:     PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
199:     PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
200:     PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
201:   } else {
202:     out_from->local.slots = 0;
203:   }
204:   return(0);
205: }
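
/*
    Sketch of the single allocation carved up above (illustrative only; n is
    the number of neighbor processes and ny = starts[n] is the total number
    of remote entries):

        values                                      ny   PetscScalar  (message buffer)
        requests = (MPI_Request*)(values + ny)      n    MPI_Request
        indices  = (int*)(requests + n)             ny   int
        starts   = (int*)(indices + ny)             n+1  int
        procs    = (int*)(starts + n + 1)           n    int

    One PetscMalloc()/PetscFree() pair therefore manages all of the
    per-neighbor message metadata; VecScatterCopy_PtoP_X() below uses the
    same layout with bs*ny scalars in values.
*/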

207: /* -------------------------------------------------------------------------------------*/
208: int VecScatterDestroy_PtoP(VecScatter ctx)
209: {
210:   VecScatter_MPI_General *gen_to   = (VecScatter_MPI_General*)ctx->todata;
211:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;

215:   if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
216:   if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
217:   if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
218:   if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}
219:   PetscFree(gen_to->sstatus);
220:   PetscFree(gen_to->values);
221:   PetscFree(gen_to);
222:   PetscFree(gen_from->values);
223:   PetscFree(gen_from);
224:   PetscLogObjectDestroy(ctx);
225:   PetscHeaderDestroy(ctx);
226:   return(0);
227: }

229: /* --------------------------------------------------------------------------------------*/
230: /*
231:      Even though the next routines are written for parallel
232:   vectors, either xin or yin (but not both) may be a Seq
233:   vector, one per processor.
234:   
235:      gen_from indices indicate where arriving data is stashed;
236:      gen_to   indices indicate where departing data comes from.
237:      The naming can be VERY confusing.

239: */
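/*
    A sketch of the forward (SCATTER_FORWARD) data flow with the naming above
    (illustrative pseudo-code only; the real loops are in the routines below):

        for (i=0; i<gen_to->n; i++) {                            send side
          for (j=gen_to->starts[i]; j<gen_to->starts[i+1]; j++)
            svalues[j] = xv[gen_to->indices[j]];
          MPI_Isend(svalues+gen_to->starts[i],...,gen_to->procs[i],...);
        }

        ... MPI_Irecv() into rvalues, then in VecScatterEnd_PtoP():

        for (j=gen_from->starts[i]; j<gen_from->starts[i+1]; j++)
          yv[gen_from->indices[j]] = rvalues[j];                 receive side

    That is, gen_to->indices select the entries of xin that are shipped out,
    and gen_from->indices say where arriving entries land in yin; for
    SCATTER_REVERSE the two roles are simply swapped.
*/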
240: int VecScatterBegin_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
241: {
242:   VecScatter_MPI_General *gen_to,*gen_from;
243:   MPI_Comm               comm = ctx->comm;
244:   PetscScalar            *xv,*yv,*val,*rvalues,*svalues;
245:   MPI_Request            *rwaits,*swaits;
246:   int                    tag = ctx->tag,i,j,*indices,*rstarts,*sstarts,*rprocs,*sprocs;
247:   int                    nrecvs,nsends,iend,ierr;

250:   VecGetArray(xin,&xv);
251:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
252:   if (mode & SCATTER_REVERSE){
253:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
254:     gen_from = (VecScatter_MPI_General*)ctx->todata;
255:   } else {
256:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
257:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
258:   }
259:   rvalues  = gen_from->values;
260:   svalues  = gen_to->values;
261:   nrecvs   = gen_from->n;
262:   nsends   = gen_to->n;
263:   rwaits   = gen_from->requests;
264:   swaits   = gen_to->requests;
265:   indices  = gen_to->indices;
266:   rstarts  = gen_from->starts;
267:   sstarts  = gen_to->starts;
268:   rprocs   = gen_from->procs;
269:   sprocs   = gen_to->procs;

271:   if (!(mode & SCATTER_LOCAL)) {

273:     if (gen_to->sendfirst) {
274:       /* do sends:  */
275:       for (i=0; i<nsends; i++) {
276:         val  = svalues + sstarts[i];
277:         iend = sstarts[i+1]-sstarts[i];
278:         /* pack the message */
279:         for (j=0; j<iend; j++) {
280:           val[j] = xv[*indices++];
281:         }
282:         MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
283:       }
284:     }
285: 
286:     /* post receives:   */
287:     for (i=0; i<nrecvs; i++) {
288:       MPI_Irecv(rvalues+rstarts[i],rstarts[i+1]-rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
289:     }

291:     if (!gen_to->sendfirst) {
292:       /* do sends:  */
293:       for (i=0; i<nsends; i++) {
294:         val  = svalues + sstarts[i];
295:         iend = sstarts[i+1]-sstarts[i];
296:         /* pack the message */
297:         for (j=0; j<iend; j++) {
298:           val[j] = xv[*indices++];
299:         }
300:         MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
301:       }
302:     }
303:   }

305:   /* take care of local scatters */
306:   if (gen_to->local.n && addv == INSERT_VALUES) {
307:     if (yv == xv && !gen_to->local.nonmatching_computed) {
308:       VecScatterLocalOptimize_Private(&gen_to->local,&gen_from->local);
309:     }
310:     if (gen_to->local.is_copy) {
311:       PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
312:     } else if (yv != xv || gen_to->local.nonmatching_computed == -1) {
313:       int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
314:       int n       = gen_to->local.n;
315:       for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
316:     } else {
317:       /* 
318:         In this case, it is copying the values into their old locations, thus we can skip those  
319:       */
320:       int *tslots = gen_to->local.slots_nonmatching,*fslots = gen_from->local.slots_nonmatching;
321:       int n       = gen_to->local.n_nonmatching;
322:       for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
323:     }
324:   } else if (gen_to->local.n) {
325:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
326:     int n = gen_to->local.n;
327:     if (addv == ADD_VALUES) {
328:       for (i=0; i<n; i++) {yv[fslots[i]] += xv[tslots[i]];}
329: #if !defined(PETSC_USE_COMPLEX)
330:     } else if (addv == MAX_VALUES) {
331:       for (i=0; i<n; i++) {yv[fslots[i]] = PetscMax(yv[fslots[i]],xv[tslots[i]]);}
332: #endif
333:     } else {SETERRQ(1,"Wrong insert option");}
334:   }

336:   VecRestoreArray(xin,&xv);
337:   if (xin != yin) {VecRestoreArray(yin,&yv);}
338:   return(0);
339: }

341: /* --------------------------------------------------------------------------------------*/
342: int VecScatterEnd_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
343: {
344:   VecScatter_MPI_General *gen_to,*gen_from;
345:   PetscScalar            *rvalues,*yv,*val;
346:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices;
347:   MPI_Request            *rwaits,*swaits;
348:   MPI_Status             rstatus,*sstatus;

351:   if (mode & SCATTER_LOCAL) return(0);
352:   VecGetArray(yin,&yv);

354:   if (mode & SCATTER_REVERSE){
355:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
356:     gen_from = (VecScatter_MPI_General*)ctx->todata;
357:     sstatus  = gen_from->sstatus;
358:   } else {
359:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
360:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
361:     sstatus  = gen_to->sstatus;
362:   }
363:   rvalues  = gen_from->values;
364:   nrecvs   = gen_from->n;
365:   nsends   = gen_to->n;
366:   rwaits   = gen_from->requests;
367:   swaits   = gen_to->requests;
368:   indices  = gen_from->indices;
369:   rstarts  = gen_from->starts;

371:   /*  wait on receives */
372:   count = nrecvs;
373:   while (count) {
374:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
375:     /* unpack receives into our local space */
376:     val      = rvalues + rstarts[imdex];
377:     n        = rstarts[imdex+1]-rstarts[imdex];
378:     lindices = indices + rstarts[imdex];
379:     if (addv == INSERT_VALUES) {
380:       for (i=0; i<n; i++) {
381:         yv[lindices[i]] = *val++;
382:       }
383:     } else if (addv == ADD_VALUES) {
384:       for (i=0; i<n; i++) {
385:         yv[lindices[i]] += *val++;
386:       }
387: #if !defined(PETSC_USE_COMPLEX)
388:     } else if (addv == MAX_VALUES) {
389:       for (i=0; i<n; i++) {
390:         yv[lindices[i]] = PetscMax(yv[lindices[i]],*val); val++;
391:       }
392: #endif
393:     }  else {SETERRQ(1,"Wrong insert option");}
394:     count--;
395:   }

397:   /* wait on sends */
398:   if (nsends) {
399:     MPI_Waitall(nsends,swaits,sstatus);
400:   }
401:   VecRestoreArray(yin,&yv);
402:   return(0);
403: }
404: /* ==========================================================================================*/
405: /*
406:     Special scatters for fixed block sizes. These provide better performance
407:     because the local copying and the packing and unpacking are done with
408:     loops unrolled to the block size.

410:     They also use MPI persistent sends and receives, which (at least in theory)
411:     allow MPI to optimize repeated sends and receives of the same pattern.
412: */
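
/*
    For reference, the MPI persistent-request pattern used below is, in
    outline (a hedged sketch; buf, count and dest are placeholders):

        MPI_Request req;
        MPI_Send_init(buf,count,MPIU_SCALAR,dest,tag,comm,&req);   set up once

        for each application of the scatter:
            ... pack buf ...
            MPI_Start(&req);           or MPI_Startall() for several requests
            ...
            MPI_Wait(&req,&status);    the request stays allocated for reuse

        MPI_Request_free(&req);        when the scatter is destroyed

    MPI_Recv_init(), MPI_Rsend_init() and MPI_Ssend_init() follow the same
    setup/start/wait cycle; see VecScatterCopy_PtoP_X() below.
*/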

414: /*
415:     This is for use with the "ready-receiver" mode. In theory, on some
416:     machines it could lead to better performance; in practice we have never
417:     seen it do so. Enabled with the -vecscatter_rr flag.
418: */
419: int VecScatterPostRecvs_PtoP_X(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
420: {
421:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;

424:   MPI_Startall_irecv(gen_from->starts[gen_from->n]*gen_from->bs,gen_from->n,gen_from->requests);
425:   return(0);
426: }
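
/*
    A sketch of the ready-receiver handshake (illustrative; names refer to the
    routines in this file): with -vecscatter_rr the persistent receives are
    started by the routine above before the matching ready-send requests,
    created with MPI_Rsend_init() in VecScatterCopy_PtoP_X(), are started in
    the block-sized VecScatterBegin_PtoP_* routines:

        VecScatterPostRecvs_PtoP_X(...);     start the persistent receives
        ... pack the send buffer ...
        MPI_Startall(nsends,swaits);         swaits built with MPI_Rsend_init()

    Ready sends are only correct when the receive is already posted, which is
    why the default path registers MPI_Send_init()/MPI_Ssend_init() requests
    instead.
*/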

428: /* --------------------------------------------------------------------------------------*/
429: /*
430:     Special optimization to check whether the local part of the scatter is
431:     actually a contiguous copy; if so, the scatter routines call PetscMemcpy()
432:     instead of an indexed copy loop.
433: */
434: int VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from,int bs)
435: {
436:   int n = gen_to->n,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
437:   int to_start,from_start;
438: 
440:   to_start   = to_slots[0];
441:   from_start = from_slots[0];

443:   for (i=1; i<n; i++) {
444:     to_start   += bs;
445:     from_start += bs;
446:     if (to_slots[i]   != to_start)   return(0);
447:     if (from_slots[i] != from_start) return(0);
448:   }
449:   gen_to->is_copy       = PETSC_TRUE;
450:   gen_to->copy_start    = to_slots[0];
451:   gen_to->copy_length   = bs*sizeof(PetscScalar)*n;
452:   gen_from->is_copy     = PETSC_TRUE;
453:   gen_from->copy_start  = from_slots[0];
454:   gen_from->copy_length = bs*sizeof(PetscScalar)*n;

456:   PetscLogInfo(0,"VecScatterLocalOptimizeCopy_Private:Local scatter is a copy, optimizing for it\n");

458:   return(0);
459: }
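
/*
    Worked example (illustrative only) for the routine above: with bs = 2 and

        to_slots   = {10, 12, 14}     (reads from xv)
        from_slots = {40, 42, 44}     (writes into yv)

    each slot is exactly bs past the previous one, so the loop never returns
    early and the local scatter is flagged as a straight copy of

        copy_length = 3*2*sizeof(PetscScalar)

    bytes from xv + 10 to yv + 40.  The scatter routines below then perform

        PetscMemcpy(yv + gen_from->local.copy_start,
                    xv + gen_to->local.copy_start,
                    gen_to->local.copy_length);

    instead of an indexed loop.
*/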

461: /* --------------------------------------------------------------------------------------*/

463: int VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
464: {
465:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
466:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
467:   int                    len,ny,bs = in_from->bs,ierr;

470:   out->postrecvs = in->postrecvs;
471:   out->begin     = in->begin;
472:   out->end       = in->end;
473:   out->copy      = in->copy;
474:   out->destroy   = in->destroy;
475:   out->view      = in->view;

477:   /* allocate entire send scatter context */
478:   PetscNew(VecScatter_MPI_General,&out_to);
479:   PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
480:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
481:   ny                = in_to->starts[in_to->n];
482:   len               = ny*(sizeof(int) + bs*sizeof(PetscScalar)) + (in_to->n+1)*sizeof(int) +
483:                      (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
484:   out_to->n         = in_to->n;
485:   out_to->type      = in_to->type;
486:   out_to->sendfirst = in_to->sendfirst;

488:   PetscMalloc(len,&out_to->values);
489:   PetscLogObjectMemory(out,len);
490:   out_to->requests  = (MPI_Request*)(out_to->values + bs*ny);
491:   out_to->indices   = (int*)(out_to->requests + out_to->n);
492:   out_to->starts    = (int*)(out_to->indices + ny);
493:   out_to->procs     = (int*)(out_to->starts + out_to->n + 1);
494:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
495:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
496:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
497:   PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
498:   out_to->rstatus   =  out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
499: 
500:   PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status));
501:   out->todata       = (void*)out_to;
502:   out_to->local.n   = in_to->local.n;
503:   out_to->local.nonmatching_computed = PETSC_FALSE;
504:   out_to->local.n_nonmatching        = 0;
505:   out_to->local.slots_nonmatching    = 0;
506:   if (in_to->local.n) {
507:     PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
508:     PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
509:     PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
510:   } else {
511:     out_to->local.slots = 0;
512:   }

514:   /* allocate entire receive context */
515:   PetscNew(VecScatter_MPI_General,&out_from);
516:   PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
517:   out_from->type      = in_from->type;
518:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
519:   ny                  = in_from->starts[in_from->n];
520:   len                 = ny*(sizeof(int) + bs*sizeof(PetscScalar)) + (in_from->n+1)*sizeof(int) +
521:                        (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
522:   out_from->n         = in_from->n;
523:   out_from->sendfirst = in_from->sendfirst;
524:   PetscMalloc(len,&out_from->values);
525:   PetscLogObjectMemory(out,len);
526:   out_from->requests  = (MPI_Request*)(out_from->values + bs*ny);
527:   out_from->indices   = (int*)(out_from->requests + out_from->n);
528:   out_from->starts    = (int*)(out_from->indices + ny);
529:   out_from->procs     = (int*)(out_from->starts + out_from->n + 1);
530:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
531:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
532:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
533:   out->fromdata       = (void*)out_from;
534:   out_from->local.n   = in_from->local.n;
535:   out_from->local.nonmatching_computed = PETSC_FALSE;
536:   out_from->local.n_nonmatching        = 0;
537:   out_from->local.slots_nonmatching    = 0;
538:   if (in_from->local.n) {
539:     PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
540:     PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
541:     PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
542:   } else {
543:     out_from->local.slots = 0;
544:   }

546:   /* 
547:       set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
548:   */
549:   {
550:     MPI_Comm    comm;
551:     int         *sstarts = out_to->starts,  *rstarts = out_from->starts;
552:     int         *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
553:     int         tag,i;
554:     PetscTruth  flg;
555:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
556:     MPI_Request *rev_swaits,*rev_rwaits;
557:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

559:     PetscMalloc((in_to->n+1)*sizeof(MPI_Request),&out_to->rev_requests);
560:     PetscMalloc((in_from->n+1)*sizeof(MPI_Request),&out_from->rev_requests);
561:     PetscLogObjectMemory(out,(in_to->n+in_from->n+2)*sizeof(MPI_Request));

563:     rev_rwaits = out_to->rev_requests;
564:     rev_swaits = out_from->rev_requests;

566:     out_from->bs = out_to->bs = bs;
567:     tag     = out->tag;
568:     comm    = out->comm;

570:     /* Register the receives that you will use later (sends for scatter reverse) */
571:     for (i=0; i<out_from->n; i++) {
572:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
573:                            comm,rwaits+i);
574:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
575:                            comm,rev_swaits+i);
576:     }

578:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
579:     if (flg) {
580:       out->postrecvs               = VecScatterPostRecvs_PtoP_X;
581:       out_to->use_readyreceiver    = PETSC_TRUE;
582:       out_from->use_readyreceiver  = PETSC_TRUE;
583:       for (i=0; i<out_to->n; i++) {
584:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
585:                               comm,swaits+i);
586:       }
587:       PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter ready receiver mode\n");
588:     } else {
589:       out->postrecvs               = 0;
590:       out_to->use_readyreceiver    = PETSC_FALSE;
591:       out_from->use_readyreceiver  = PETSC_FALSE;
592:       flg                          = PETSC_FALSE;
593:       ierr                         = PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
594:       if (flg) {
595:         PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter Ssend mode\n");
596:       }
597:       for (i=0; i<out_to->n; i++) {
598:         if (!flg) {
599:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
600:                                comm,swaits+i);
601:         } else {
602:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
603:                                comm,swaits+i);
604:         }
605:       }
606:     }
607:     /* Register receives for scatter reverse */
608:     for (i=0; i<out_to->n; i++) {
609:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
610:                            comm,rev_rwaits+i);
611:     }
612:   }

614:   return(0);
615: }

617: /* --------------------------------------------------------------------------------------*/

619: int VecScatterBegin_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
620: {
621:   VecScatter_MPI_General *gen_to,*gen_from;
622:   PetscScalar            *xv,*yv,*val,*svalues;
623:   MPI_Request            *rwaits,*swaits;
624:   int                    *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;

627:   VecGetArray(xin,&xv);
628:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

630:   if (mode & SCATTER_REVERSE) {
631:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
632:     gen_from = (VecScatter_MPI_General*)ctx->todata;
633:     rwaits   = gen_from->rev_requests;
634:     swaits   = gen_to->rev_requests;
635:   } else {
636:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
637:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
638:     rwaits   = gen_from->requests;
639:     swaits   = gen_to->requests;
640:   }
641:   svalues  = gen_to->values;
642:   nrecvs   = gen_from->n;
643:   nsends   = gen_to->n;
644:   indices  = gen_to->indices;
645:   sstarts  = gen_to->starts;

647:   if (!(mode & SCATTER_LOCAL)) {

649:     if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
650:       /* post receives since they were not posted in VecScatterPostRecvs()   */
651:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
652:     }
653:     if (ctx->packtogether) {
654:       /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
655:       len  = 12*sstarts[nsends];
656:       val  = svalues;
657:       for (i=0; i<len; i += 12) {
658:         idx     = *indices++;
659:         val[0]  = xv[idx];
660:         val[1]  = xv[idx+1];
661:         val[2]  = xv[idx+2];
662:         val[3]  = xv[idx+3];
663:         val[4]  = xv[idx+4];
664:         val[5]  = xv[idx+5];
665:         val[6]  = xv[idx+6];
666:         val[7]  = xv[idx+7];
667:         val[8]  = xv[idx+8];
668:         val[9]  = xv[idx+9];
669:         val[10] = xv[idx+10];
670:         val[11] = xv[idx+11];
671:         val    += 12;
672:       }
673:       MPI_Startall_isend(len,nsends,swaits);
674:     } else {
675:       /* this version packs and sends one at a time */
676:       val  = svalues;
677:       for (i=0; i<nsends; i++) {
678:         iend = sstarts[i+1]-sstarts[i];

680:         for (j=0; j<iend; j++) {
681:           idx     = *indices++;
682:           val[0]  = xv[idx];
683:           val[1]  = xv[idx+1];
684:           val[2]  = xv[idx+2];
685:           val[3]  = xv[idx+3];
686:           val[4]  = xv[idx+4];
687:           val[5]  = xv[idx+5];
688:           val[6]  = xv[idx+6];
689:           val[7]  = xv[idx+7];
690:           val[8]  = xv[idx+8];
691:           val[9]  = xv[idx+9];
692:           val[10] = xv[idx+10];
693:           val[11] = xv[idx+11];
694:           val    += 12;
695:         }
696:         MPI_Start_isend(12*iend,swaits+i);
697:       }
698:     }

700:     if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
701:       /* post receives since they were not posted in VecScatterPostRecvs()   */
702:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
703:     }
704:   }

706:   /* take care of local scatters */
707:   if (gen_to->local.n) {
708:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
709:     int n       = gen_to->local.n,il,ir;
710:     if (addv == INSERT_VALUES) {
711:       if (gen_to->local.is_copy) {
712:         PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
713:       } else {
714:         for (i=0; i<n; i++) {
715:           il = fslots[i]; ir = tslots[i];
716:           yv[il]    = xv[ir];
717:           yv[il+1]  = xv[ir+1];
718:           yv[il+2]  = xv[ir+2];
719:           yv[il+3]  = xv[ir+3];
720:           yv[il+4]  = xv[ir+4];
721:           yv[il+5]  = xv[ir+5];
722:           yv[il+6]  = xv[ir+6];
723:           yv[il+7]  = xv[ir+7];
724:           yv[il+8]  = xv[ir+8];
725:           yv[il+9]  = xv[ir+9];
726:           yv[il+10] = xv[ir+10];
727:           yv[il+11] = xv[ir+11];
728:         }
729:       }
730:     }  else if (addv == ADD_VALUES) {
731:       for (i=0; i<n; i++) {
732:         il = fslots[i]; ir = tslots[i];
733:         yv[il]    += xv[ir];
734:         yv[il+1]  += xv[ir+1];
735:         yv[il+2]  += xv[ir+2];
736:         yv[il+3]  += xv[ir+3];
737:         yv[il+4]  += xv[ir+4];
738:         yv[il+5]  += xv[ir+5];
739:         yv[il+6]  += xv[ir+6];
740:         yv[il+7]  += xv[ir+7];
741:         yv[il+8]  += xv[ir+8];
742:         yv[il+9]  += xv[ir+9];
743:         yv[il+10] += xv[ir+10];
744:         yv[il+11] += xv[ir+11];
745:       }
746: #if !defined(PETSC_USE_COMPLEX)
747:     }  else if (addv == MAX_VALUES) {
748:       for (i=0; i<n; i++) {
749:         il = fslots[i]; ir = tslots[i];
750:         yv[il]    = PetscMax(yv[il],xv[ir]);
751:         yv[il+1]  = PetscMax(yv[il+1],xv[ir+1]);
752:         yv[il+2]  = PetscMax(yv[il+2],xv[ir+2]);
753:         yv[il+3]  = PetscMax(yv[il+3],xv[ir+3]);
754:         yv[il+4]  = PetscMax(yv[il+4],xv[ir+4]);
755:         yv[il+5]  = PetscMax(yv[il+5],xv[ir+5]);
756:         yv[il+6]  = PetscMax(yv[il+6],xv[ir+6]);
757:         yv[il+7]  = PetscMax(yv[il+7],xv[ir+7]);
758:         yv[il+8]  = PetscMax(yv[il+8],xv[ir+8]);
759:         yv[il+9]  = PetscMax(yv[il+9],xv[ir+9]);
760:         yv[il+10] = PetscMax(yv[il+10],xv[ir+10]);
761:         yv[il+11] = PetscMax(yv[il+11],xv[ir+11]);
762:       }
763: #endif
764:     } else {SETERRQ(1,"Wrong insert option");}
765:   }
766:   VecRestoreArray(xin,&xv);
767:   if (xin != yin) {VecRestoreArray(yin,&yv);}
768:   return(0);
769: }

771: /* --------------------------------------------------------------------------------------*/

773: int VecScatterEnd_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
774: {
775:   VecScatter_MPI_General *gen_to,*gen_from;
776:   PetscScalar            *rvalues,*yv,*val;
777:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
778:   MPI_Request            *rwaits,*swaits;
779:   MPI_Status             *rstatus,*sstatus;

782:   if (mode & SCATTER_LOCAL) return(0);
783:   VecGetArray(yin,&yv);

785:   if (mode & SCATTER_REVERSE) {
786:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
787:     gen_from = (VecScatter_MPI_General*)ctx->todata;
788:     rwaits   = gen_from->rev_requests;
789:     swaits   = gen_to->rev_requests;
790:     sstatus  = gen_from->sstatus;
791:     rstatus  = gen_from->rstatus;
792:   } else {
793:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
794:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
795:     rwaits   = gen_from->requests;
796:     swaits   = gen_to->requests;
797:     sstatus  = gen_to->sstatus;
798:     rstatus  = gen_to->rstatus;
799:   }
800:   rvalues  = gen_from->values;
801:   nrecvs   = gen_from->n;
802:   nsends   = gen_to->n;
803:   indices  = gen_from->indices;
804:   rstarts  = gen_from->starts;

806:   /*  wait on receives */
807:   count = nrecvs;
808:   if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
809:     ierr     = MPI_Waitall(nrecvs,rwaits,rstatus);
810:     n        = rstarts[count];
811:     val      = rvalues;
812:     lindices = indices;
813:     if (addv == INSERT_VALUES) {
814:       for (i=0; i<n; i++) {
815:         idx        = lindices[i];
816:         yv[idx]    = val[0];
817:         yv[idx+1]  = val[1];
818:         yv[idx+2]  = val[2];
819:         yv[idx+3]  = val[3];
820:         yv[idx+4]  = val[4];
821:         yv[idx+5]  = val[5];
822:         yv[idx+6]  = val[6];
823:         yv[idx+7]  = val[7];
824:         yv[idx+8]  = val[8];
825:         yv[idx+9]  = val[9];
826:         yv[idx+10] = val[10];
827:         yv[idx+11] = val[11];
828:         val       += 12;
829:       }
830:     } else if (addv == ADD_VALUES) {
831:       for (i=0; i<n; i++) {
832:         idx         = lindices[i];
833:         yv[idx]    += val[0];
834:         yv[idx+1]  += val[1];
835:         yv[idx+2]  += val[2];
836:         yv[idx+3]  += val[3];
837:         yv[idx+4]  += val[4];
838:         yv[idx+5]  += val[5];
839:         yv[idx+6]  += val[6];
840:         yv[idx+7]  += val[7];
841:         yv[idx+8]  += val[8];
842:         yv[idx+9]  += val[9];
843:         yv[idx+10] += val[10];
844:         yv[idx+11] += val[11];
845:         val        += 12;
846:       }
847: #if !defined(PETSC_USE_COMPLEX)
848:     } else if (addv == MAX_VALUES) {
849:       for (i=0; i<n; i++) {
850:         idx        = lindices[i];
851:         yv[idx]    = PetscMax(yv[idx],val[0]);
852:         yv[idx+1]  = PetscMax(yv[idx+1],val[1]);
853:         yv[idx+2]  = PetscMax(yv[idx+2],val[2]);
854:         yv[idx+3]  = PetscMax(yv[idx+3],val[3]);
855:         yv[idx+4]  = PetscMax(yv[idx+4],val[4]);
856:         yv[idx+5]  = PetscMax(yv[idx+5],val[5]);
857:         yv[idx+6]  = PetscMax(yv[idx+6],val[6]);
858:         yv[idx+7]  = PetscMax(yv[idx+7],val[7]);
859:         yv[idx+8]  = PetscMax(yv[idx+8],val[8]);
860:         yv[idx+9]  = PetscMax(yv[idx+9],val[9]);
861:         yv[idx+10] = PetscMax(yv[idx+10],val[10]);
862:         yv[idx+11] = PetscMax(yv[idx+11],val[11]);
863:         val       += 12;
864:       }
865: #endif
866:     }  else {SETERRQ(1,"Wrong insert option");}
867:   } else { /* unpack each message as it arrives, default version */
868:     while (count) {
869:       MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
870:       /* unpack receives into our local space */
871:       val      = rvalues + 12*rstarts[imdex];
872:       lindices = indices + rstarts[imdex];
873:       n        = rstarts[imdex+1] - rstarts[imdex];
874:       if (addv == INSERT_VALUES) {
875:         for (i=0; i<n; i++) {
876:           idx        = lindices[i];
877:           yv[idx]    = val[0];
878:           yv[idx+1]  = val[1];
879:           yv[idx+2]  = val[2];
880:           yv[idx+3]  = val[3];
881:           yv[idx+4]  = val[4];
882:           yv[idx+5]  = val[5];
883:           yv[idx+6]  = val[6];
884:           yv[idx+7]  = val[7];
885:           yv[idx+8]  = val[8];
886:           yv[idx+9]  = val[9];
887:           yv[idx+10] = val[10];
888:           yv[idx+11] = val[11];
889:           val       += 12;
890:         }
891:       } else if (addv == ADD_VALUES) {
892:         for (i=0; i<n; i++) {
893:           idx        = lindices[i];
894:           yv[idx]    += val[0];
895:           yv[idx+1]  += val[1];
896:           yv[idx+2]  += val[2];
897:           yv[idx+3]  += val[3];
898:           yv[idx+4]  += val[4];
899:           yv[idx+5]  += val[5];
900:           yv[idx+6]  += val[6];
901:           yv[idx+7]  += val[7];
902:           yv[idx+8]  += val[8];
903:           yv[idx+9]  += val[9];
904:           yv[idx+10] += val[10];
905:           yv[idx+11] += val[11];
906:           val        += 12;
907:         }
908: #if !defined(PETSC_USE_COMPLEX)
909:       } else if (addv == MAX_VALUES) {
910:         for (i=0; i<n; i++) {
911:           idx        = lindices[i];
912:           yv[idx]    = PetscMax(yv[idx],val[0]);
913:           yv[idx+1]  = PetscMax(yv[idx+1],val[1]);
914:           yv[idx+2]  = PetscMax(yv[idx+2],val[2]);
915:           yv[idx+3]  = PetscMax(yv[idx+3],val[3]);
916:           yv[idx+4]  = PetscMax(yv[idx+4],val[4]);
917:           yv[idx+5]  = PetscMax(yv[idx+5],val[5]);
918:           yv[idx+6]  = PetscMax(yv[idx+6],val[6]);
919:           yv[idx+7]  = PetscMax(yv[idx+7],val[7]);
920:           yv[idx+8]  = PetscMax(yv[idx+8],val[8]);
921:           yv[idx+9]  = PetscMax(yv[idx+9],val[9]);
922:           yv[idx+10] = PetscMax(yv[idx+10],val[10]);
923:           yv[idx+11] = PetscMax(yv[idx+11],val[11]);
924:           val        += 12;
925:         }
926: #endif
927:       }  else {SETERRQ(1,"Wrong insert option");}
928:       count--;
929:     }
930:   }
931:   /* wait on sends */
932:   if (nsends) {
933:     MPI_Waitall(nsends,swaits,sstatus);
934:   }
935:   VecRestoreArray(yin,&yv);
936:   return(0);
937: }

939: /* --------------------------------------------------------------------------------------*/

941: int VecScatterBegin_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
942: {
943:   VecScatter_MPI_General *gen_to,*gen_from;
944:   PetscScalar            *xv,*yv,*val,*svalues;
945:   MPI_Request            *rwaits,*swaits;
946:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

949:   VecGetArray(xin,&xv);
950:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
951:   if (mode & SCATTER_REVERSE) {
952:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
953:     gen_from = (VecScatter_MPI_General*)ctx->todata;
954:     rwaits   = gen_from->rev_requests;
955:     swaits   = gen_to->rev_requests;
956:   } else {
957:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
958:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
959:     rwaits   = gen_from->requests;
960:     swaits   = gen_to->requests;
961:   }
962:   svalues  = gen_to->values;
963:   nrecvs   = gen_from->n;
964:   nsends   = gen_to->n;
965:   indices  = gen_to->indices;
966:   sstarts  = gen_to->starts;

968:   if (!(mode & SCATTER_LOCAL)) {

970:     if (gen_to->sendfirst) {
971:       /* this version packs and sends one at a time */
972:       val  = svalues;
973:       for (i=0; i<nsends; i++) {
974:         iend = sstarts[i+1]-sstarts[i];

976:         for (j=0; j<iend; j++) {
977:           idx     = *indices++;
978:           val[0] = xv[idx];
979:           val[1] = xv[idx+1];
980:           val[2] = xv[idx+2];
981:           val[3] = xv[idx+3];
982:           val[4] = xv[idx+4];
983:           val    += 5;
984:         }
985:         MPI_Start_isend(5*iend,swaits+i);
986:       }
987:     }

989:     if (!gen_from->use_readyreceiver) {
990:       /* post receives since they were not posted in VecScatterPostRecvs()   */
991:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
992:     }

994:     if (!gen_to->sendfirst) {
995:       /* this version packs all the messages together and sends */
996:       /*
997:       len  = 5*sstarts[nsends];
998:       val  = svalues;
999:       for (i=0; i<len; i += 5) {
1000:         idx     = *indices++;
1001:         val[0] = xv[idx];
1002:         val[1] = xv[idx+1];
1003:         val[2] = xv[idx+2];
1004:         val[3] = xv[idx+3];
1005:         val[4] = xv[idx+4];
1006:         val      += 5;
1007:       }
1008:       MPI_Startall_isend(len,nsends,swaits);
1009:       */

1011:       /* this version packs and sends one at a time */
1012:       val  = svalues;
1013:       for (i=0; i<nsends; i++) {
1014:         iend = sstarts[i+1]-sstarts[i];

1016:         for (j=0; j<iend; j++) {
1017:           idx     = *indices++;
1018:           val[0] = xv[idx];
1019:           val[1] = xv[idx+1];
1020:           val[2] = xv[idx+2];
1021:           val[3] = xv[idx+3];
1022:           val[4] = xv[idx+4];
1023:           val    += 5;
1024:         }
1025:         MPI_Start_isend(5*iend,swaits+i);
1026:       }
1027:     }
1028:   }

1030:   /* take care of local scatters */
1031:   if (gen_to->local.n) {
1032:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1033:     int n       = gen_to->local.n,il,ir;
1034:     if (addv == INSERT_VALUES) {
1035:       if (gen_to->local.is_copy) {
1036:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1037:       } else {
1038:         for (i=0; i<n; i++) {
1039:           il = fslots[i]; ir = tslots[i];
1040:           yv[il]   = xv[ir];
1041:           yv[il+1] = xv[ir+1];
1042:           yv[il+2] = xv[ir+2];
1043:           yv[il+3] = xv[ir+3];
1044:           yv[il+4] = xv[ir+4];
1045:         }
1046:       }
1047:     }  else if (addv == ADD_VALUES) {
1048:       for (i=0; i<n; i++) {
1049:         il = fslots[i]; ir = tslots[i];
1050:         yv[il]   += xv[ir];
1051:         yv[il+1] += xv[ir+1];
1052:         yv[il+2] += xv[ir+2];
1053:         yv[il+3] += xv[ir+3];
1054:         yv[il+4] += xv[ir+4];
1055:       }
1056: #if !defined(PETSC_USE_COMPLEX)
1057:     }  else if (addv == MAX_VALUES) {
1058:       for (i=0; i<n; i++) {
1059:         il = fslots[i]; ir = tslots[i];
1060:         yv[il]   = PetscMax(yv[il],xv[ir]);
1061:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1062:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1063:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1064:         yv[il+4] = PetscMax(yv[il+4],xv[ir+4]);
1065:       }
1066: #endif
1067:     }  else {SETERRQ(1,"Wrong insert option");}
1068:   }
1069:   VecRestoreArray(xin,&xv);
1070:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1071:   return(0);
1072: }

1074: /* --------------------------------------------------------------------------------------*/

1076: int VecScatterEnd_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1077: {
1078:   VecScatter_MPI_General *gen_to,*gen_from;
1079:   PetscScalar            *rvalues,*yv,*val;
1080:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1081:   MPI_Request            *rwaits,*swaits;
1082:   MPI_Status             rstatus,*sstatus;

1085:   if (mode & SCATTER_LOCAL) return(0);
1086:   VecGetArray(yin,&yv);

1088:   if (mode & SCATTER_REVERSE) {
1089:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1090:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1091:     rwaits   = gen_from->rev_requests;
1092:     swaits   = gen_to->rev_requests;
1093:     sstatus  = gen_from->sstatus;
1094:   } else {
1095:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1096:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1097:     rwaits   = gen_from->requests;
1098:     swaits   = gen_to->requests;
1099:     sstatus  = gen_to->sstatus;
1100:   }
1101:   rvalues  = gen_from->values;
1102:   nrecvs   = gen_from->n;
1103:   nsends   = gen_to->n;
1104:   indices  = gen_from->indices;
1105:   rstarts  = gen_from->starts;

1107:   /*  wait on receives */
1108:   count = nrecvs;
1109:   while (count) {
1110:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1111:     /* unpack receives into our local space */
1112:     val      = rvalues + 5*rstarts[imdex];
1113:     lindices = indices + rstarts[imdex];
1114:     n        = rstarts[imdex+1] - rstarts[imdex];
1115:     if (addv == INSERT_VALUES) {
1116:       for (i=0; i<n; i++) {
1117:         idx       = lindices[i];
1118:         yv[idx]   = val[0];
1119:         yv[idx+1] = val[1];
1120:         yv[idx+2] = val[2];
1121:         yv[idx+3] = val[3];
1122:         yv[idx+4] = val[4];
1123:         val      += 5;
1124:       }
1125:     } else if (addv == ADD_VALUES) {
1126:       for (i=0; i<n; i++) {
1127:         idx       = lindices[i];
1128:         yv[idx]   += val[0];
1129:         yv[idx+1] += val[1];
1130:         yv[idx+2] += val[2];
1131:         yv[idx+3] += val[3];
1132:         yv[idx+4] += val[4];
1133:         val       += 5;
1134:       }
1135: #if !defined(PETSC_USE_COMPLEX)
1136:     } else if (addv == MAX_VALUES) {
1137:       for (i=0; i<n; i++) {
1138:         idx       = lindices[i];
1139:         yv[idx]   = PetscMax(yv[idx],val[0]);
1140:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1141:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1142:         yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1143:         yv[idx+4] = PetscMax(yv[idx+4],val[4]);
1144:         val       += 5;
1145:       }
1146: #endif
1147:     }  else {SETERRQ(1,"Wrong insert option");}
1148:     count--;
1149:   }
1150:   /* wait on sends */
1151:   if (nsends) {
1152:     MPI_Waitall(nsends,swaits,sstatus);
1153:   }
1154:   VecRestoreArray(yin,&yv);
1155:   return(0);
1156: }

1158: /* --------------------------------------------------------------------------------------*/

1160: int VecScatterBegin_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1161: {
1162:   VecScatter_MPI_General *gen_to,*gen_from;
1163:   PetscScalar            *xv,*yv,*val,*svalues;
1164:   MPI_Request            *rwaits,*swaits;
1165:   int                    *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;

1168:   VecGetArray(xin,&xv);
1169:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

1171:   if (mode & SCATTER_REVERSE) {
1172:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1173:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1174:     rwaits   = gen_from->rev_requests;
1175:     swaits   = gen_to->rev_requests;
1176:   } else {
1177:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1178:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1179:     rwaits   = gen_from->requests;
1180:     swaits   = gen_to->requests;
1181:   }
1182:   svalues  = gen_to->values;
1183:   nrecvs   = gen_from->n;
1184:   nsends   = gen_to->n;
1185:   indices  = gen_to->indices;
1186:   sstarts  = gen_to->starts;

1188:   if (!(mode & SCATTER_LOCAL)) {

1190:     if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
1191:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1192:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1193:     }

1195:     if (ctx->packtogether) {
1196:       /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
1197:       len  = 4*sstarts[nsends];
1198:       val  = svalues;
1199:       for (i=0; i<len; i += 4) {
1200:         idx    = *indices++;
1201:         val[0] = xv[idx];
1202:         val[1] = xv[idx+1];
1203:         val[2] = xv[idx+2];
1204:         val[3] = xv[idx+3];
1205:         val    += 4;
1206:       }
1207:       MPI_Startall_isend(len,nsends,swaits);
1208:     } else {
1209:       /* this version packs and sends one at a time, default */
1210:       val  = svalues;
1211:       for (i=0; i<nsends; i++) {
1212:         iend = sstarts[i+1]-sstarts[i];

1214:         for (j=0; j<iend; j++) {
1215:           idx     = *indices++;
1216:           val[0] = xv[idx];
1217:           val[1] = xv[idx+1];
1218:           val[2] = xv[idx+2];
1219:           val[3] = xv[idx+3];
1220:           val    += 4;
1221:         }
1222:         MPI_Start_isend(4*iend,swaits+i);
1223:       }
1224:     }

1226:     if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
1227:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1228:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1229:     }
1230:   }

1232:   /* take care of local scatters */
1233:   if (gen_to->local.n) {
1234:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1235:     int n       = gen_to->local.n,il,ir;
1236:     if (addv == INSERT_VALUES) {
1237:       if (gen_to->local.is_copy) {
1238:         PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
1239:       } else {
1240:         for (i=0; i<n; i++) {
1241:           il = fslots[i]; ir = tslots[i];
1242:           yv[il]   = xv[ir];
1243:           yv[il+1] = xv[ir+1];
1244:           yv[il+2] = xv[ir+2];
1245:           yv[il+3] = xv[ir+3];
1246:         }
1247:       }
1248:     }  else if (addv == ADD_VALUES) {
1249:       for (i=0; i<n; i++) {
1250:         il = fslots[i]; ir = tslots[i];
1251:         yv[il]   += xv[ir];
1252:         yv[il+1] += xv[ir+1];
1253:         yv[il+2] += xv[ir+2];
1254:         yv[il+3] += xv[ir+3];
1255:       }
1256: #if !defined(PETSC_USE_COMPLEX)
1257:     }  else if (addv == MAX_VALUES) {
1258:       for (i=0; i<n; i++) {
1259:         il = fslots[i]; ir = tslots[i];
1260:         yv[il]   = PetscMax(yv[il],xv[ir]);
1261:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1262:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1263:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1264:       }
1265: #endif
1266:     }  else {SETERRQ(1,"Wrong insert option");}
1267:   }
1268:   VecRestoreArray(xin,&xv);
1269:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1270:   return(0);
1271: }

1273: /* --------------------------------------------------------------------------------------*/

1275: int VecScatterEnd_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1276: {
1277:   VecScatter_MPI_General *gen_to,*gen_from;
1278:   PetscScalar            *rvalues,*yv,*val;
1279:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1280:   MPI_Request            *rwaits,*swaits;
1281:   MPI_Status             *rstatus,*sstatus;

1284:   if (mode & SCATTER_LOCAL) return(0);
1285:   VecGetArray(yin,&yv);

1287:   if (mode & SCATTER_REVERSE) {
1288:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1289:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1290:     rwaits   = gen_from->rev_requests;
1291:     swaits   = gen_to->rev_requests;
1292:     sstatus  = gen_from->sstatus;
1293:     rstatus  = gen_from->rstatus;
1294:   } else {
1295:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1296:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1297:     rwaits   = gen_from->requests;
1298:     swaits   = gen_to->requests;
1299:     sstatus  = gen_to->sstatus;
1300:     rstatus  = gen_to->rstatus;
1301:   }
1302:   rvalues  = gen_from->values;
1303:   nrecvs   = gen_from->n;
1304:   nsends   = gen_to->n;
1305:   indices  = gen_from->indices;
1306:   rstarts  = gen_from->starts;

1308:   /*  wait on receives */
1309:   count = nrecvs;
1310:   if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
1311:     ierr     = MPI_Waitall(nrecvs,rwaits,rstatus);
1312:     n        = rstarts[count];
1313:     val      = rvalues;
1314:     lindices = indices;
1315:     if (addv == INSERT_VALUES) {
1316:       for (i=0; i<n; i++) {
1317:         idx       = lindices[i];
1318:         yv[idx]   = val[0];
1319:         yv[idx+1] = val[1];
1320:         yv[idx+2] = val[2];
1321:         yv[idx+3] = val[3];
1322:         val      += 4;
1323:       }
1324:     } else if (addv == ADD_VALUES) {
1325:       for (i=0; i<n; i++) {
1326:         idx       = lindices[i];
1327:         yv[idx]   += val[0];
1328:         yv[idx+1] += val[1];
1329:         yv[idx+2] += val[2];
1330:         yv[idx+3] += val[3];
1331:         val       += 4;
1332:       }
1333: #if !defined(PETSC_USE_COMPLEX)
1334:     } else if (addv == MAX_VALUES) {
1335:       for (i=0; i<n; i++) {
1336:         idx       = lindices[i];
1337:         yv[idx]   = PetscMax(yv[idx],val[0]);
1338:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1339:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1340:         yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1341:         val       += 4;
1342:       }
1343: #endif
1344:     }  else {SETERRQ(1,"Wrong insert option");}
1345:   } else { /* unpack each message as it arrives, default version */
1346:     while (count) {
1347:       MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
1348:       /* unpack receives into our local space */
1349:       val      = rvalues + 4*rstarts[imdex];
1350:       lindices = indices + rstarts[imdex];
1351:       n        = rstarts[imdex+1] - rstarts[imdex];
1352:       if (addv == INSERT_VALUES) {
1353:         for (i=0; i<n; i++) {
1354:           idx       = lindices[i];
1355:           yv[idx]   = val[0];
1356:           yv[idx+1] = val[1];
1357:           yv[idx+2] = val[2];
1358:           yv[idx+3] = val[3];
1359:           val      += 4;
1360:         }
1361:       } else if (addv == ADD_VALUES) {
1362:         for (i=0; i<n; i++) {
1363:           idx       = lindices[i];
1364:           yv[idx]   += val[0];
1365:           yv[idx+1] += val[1];
1366:           yv[idx+2] += val[2];
1367:           yv[idx+3] += val[3];
1368:           val       += 4;
1369:         }
1370: #if !defined(PETSC_USE_COMPLEX)
1371:       } else if (addv == MAX_VALUES) {
1372:         for (i=0; i<n; i++) {
1373:           idx       = lindices[i];
1374:           yv[idx]   = PetscMax(yv[idx],val[0]);
1375:           yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1376:           yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1377:           yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1378:           val       += 4;
1379:         }
1380: #endif
1381:       }  else {SETERRQ(1,"Wrong insert option");}
1382:       count--;
1383:     }
1384:   }

1386:   /* wait on sends */
1387:   if (nsends) {
1388:     MPI_Waitall(nsends,swaits,sstatus);
1389:   }
1390:   VecRestoreArray(yin,&yv);
1391:   return(0);
1392: }

1394: /* --------------------------------------------------------------------------------------*/

1396: int VecScatterBegin_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1397: {
1398:   VecScatter_MPI_General *gen_to,*gen_from;
1399:   PetscScalar            *xv,*yv,*val,*svalues;
1400:   MPI_Request            *rwaits,*swaits;
1401:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

1404:   VecGetArray(xin,&xv);
1405:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

1407:   if (mode & SCATTER_REVERSE) {
1408:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1409:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1410:     rwaits   = gen_from->rev_requests;
1411:     swaits   = gen_to->rev_requests;
1412:   } else {
1413:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1414:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1415:     rwaits   = gen_from->requests;
1416:     swaits   = gen_to->requests;
1417:   }
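  /*
     Editor's note: for SCATTER_REVERSE the two halves of the context swap roles;
     the buffers and index lists that normally describe receives now describe sends
     and vice versa, and the rev_requests (persistent requests created in
     VecScatterCreate_PtoS() with the direction of each transfer reversed) are
     started instead of the forward ones.
  */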
1418:   svalues  = gen_to->values;
1419:   nrecvs   = gen_from->n;
1420:   nsends   = gen_to->n;
1421:   indices  = gen_to->indices;
1422:   sstarts  = gen_to->starts;

1424:   if (!(mode & SCATTER_LOCAL)) {

1426:     if (gen_to->sendfirst) {
1427:       /* this version packs and sends one at a time */
1428:       val  = svalues;
1429:       for (i=0; i<nsends; i++) {
1430:         iend = sstarts[i+1]-sstarts[i];

1432:         for (j=0; j<iend; j++) {
1433:           idx     = *indices++;
1434:           val[0] = xv[idx];
1435:           val[1] = xv[idx+1];
1436:           val[2] = xv[idx+2];
1437:           val    += 3;
1438:         }
1439:         MPI_Start_isend(3*iend,swaits+i);
1440:       }
1441:     }

1443:     if (!gen_from->use_readyreceiver) {
1444:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1445:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1446:     }

1448:     if (!gen_to->sendfirst) {
1449:       /* this version packs all the messages together and sends */
1450:       /*
1451:       len  = 3*sstarts[nsends];
1452:       val  = svalues;
1453:       for (i=0; i<len; i += 3) {
1454:         idx     = *indices++;
1455:         val[0] = xv[idx];
1456:         val[1] = xv[idx+1];
1457:         val[2] = xv[idx+2];
1458:         val      += 3;
1459:       }
1460:       MPI_Startall_isend(len,nsends,swaits);
1461:       */

1463:       /* this version packs and sends one at a time */
1464:       val  = svalues;
1465:       for (i=0; i<nsends; i++) {
1466:         iend = sstarts[i+1]-sstarts[i];

1468:         for (j=0; j<iend; j++) {
1469:           idx     = *indices++;
1470:           val[0] = xv[idx];
1471:           val[1] = xv[idx+1];
1472:           val[2] = xv[idx+2];
1473:           val    += 3;
1474:         }
1475:         MPI_Start_isend(3*iend,swaits+i);
1476:       }
1477:     }
1478:   }

1480:   /* take care of local scatters */
1481:   if (gen_to->local.n) {
1482:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1483:     int n       = gen_to->local.n,il,ir;
1484:     if (addv == INSERT_VALUES) {
1485:       if (gen_to->local.is_copy) {
1486:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1487:       } else {
1488:         for (i=0; i<n; i++) {
1489:           il = fslots[i]; ir = tslots[i];
1490:           yv[il]   = xv[ir];
1491:           yv[il+1] = xv[ir+1];
1492:           yv[il+2] = xv[ir+2];
1493:           yv[il+3] = xv[ir+3];
1494:         }
1495:       }
1496:     }  else if (addv == ADD_VALUES) {
1497:       for (i=0; i<n; i++) {
1498:         il = fslots[i]; ir = tslots[i];
1499:         yv[il]   += xv[ir];
1500:         yv[il+1] += xv[ir+1];
1501:         yv[il+2] += xv[ir+2];
1502:         yv[il+3] += xv[ir+3];
1503:       }
1504: #if !defined(PETSC_USE_COMPLEX)
1505:     }  else if (addv == MAX_VALUES) {
1506:       for (i=0; i<n; i++) {
1507:         il = fslots[i]; ir = tslots[i];
1508:         yv[il]   = PetscMax(yv[il],xv[ir]);
1509:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1510:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1511:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1512:       }
1513: #endif
1514:     }  else {SETERRQ(1,"Wrong insert option");}
1515:   }
1516:   VecRestoreArray(xin,&xv);
1517:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1518:   return(0);
1519: }
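/*
   Editor's note: MPI_Start_isend() and MPI_Startall_irecv() used above are PETSc
   wrappers (presumably defined earlier in this file) that start the persistent
   requests built in VecScatterCreate_PtoS().  A minimal stand-alone sketch of the
   persistent-request lifecycle these scatters rely on, with a hypothetical buffer,
   peer and tag:
*/
static void PersistentSendLifecycle_Sketch(PetscScalar *buf,int len,int peer,
                                           int tag,MPI_Comm comm)
{
  MPI_Request req;
  MPI_Status  status;

  MPI_Send_init(buf,len,MPIU_SCALAR,peer,tag,comm,&req); /* describe the send once        */
  MPI_Start(&req);                                       /* activate it for one scatter   */
  MPI_Wait(&req,&status);                                /* complete this use             */
  MPI_Request_free(&req);                                /* release when the context dies */
}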

1521: /* --------------------------------------------------------------------------------------*/

1523: int VecScatterEnd_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1524: {
1525:   VecScatter_MPI_General *gen_to,*gen_from;
1526:   PetscScalar            *rvalues,*yv,*val;
1527:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1528:   MPI_Request            *rwaits,*swaits;
1529:   MPI_Status             rstatus,*sstatus;

1532:   if (mode & SCATTER_LOCAL) return(0);
1533:   VecGetArray(yin,&yv);

1535:   if (mode & SCATTER_REVERSE) {
1536:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1537:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1538:     rwaits   = gen_from->rev_requests;
1539:     swaits   = gen_to->rev_requests;
1540:     sstatus  = gen_from->sstatus;
1541:   } else {
1542:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1543:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1544:     rwaits   = gen_from->requests;
1545:     swaits   = gen_to->requests;
1546:     sstatus  = gen_to->sstatus;
1547:   }
1548:   rvalues  = gen_from->values;
1549:   nrecvs   = gen_from->n;
1550:   nsends   = gen_to->n;
1551:   indices  = gen_from->indices;
1552:   rstarts  = gen_from->starts;

1554:   /*  wait on receives */
1555:   count = nrecvs;
1556:   while (count) {
1557:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1558:     /* unpack receives into our local space */
1559:     val      = rvalues + 3*rstarts[imdex];
1560:     lindices = indices + rstarts[imdex];
1561:     n        = rstarts[imdex+1] - rstarts[imdex];
1562:     if (addv == INSERT_VALUES) {
1563:       for (i=0; i<n; i++) {
1564:         idx       = lindices[i];
1565:         yv[idx]   = val[0];
1566:         yv[idx+1] = val[1];
1567:         yv[idx+2] = val[2];
1568:         val      += 3;
1569:       }
1570:     } else if (addv == ADD_VALUES) {
1571:       for (i=0; i<n; i++) {
1572:         idx       = lindices[i];
1573:         yv[idx]   += val[0];
1574:         yv[idx+1] += val[1];
1575:         yv[idx+2] += val[2];
1576:         val       += 3;
1577:       }
1578: #if !defined(PETSC_USE_COMPLEX)
1579:     } else if (addv == MAX_VALUES) {
1580:       for (i=0; i<n; i++) {
1581:         idx       = lindices[i];
1582:         yv[idx]   = PetscMax(yv[idx],val[0]);
1583:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1584:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1585:         val       += 3;
1586:       }
1587: #endif
1588:     }  else {SETERRQ(1,"Wrong insert option");}
1589:     count--;
1590:   }
1591:   /* wait on sends */
1592:   if (nsends) {
1593:     MPI_Waitall(nsends,swaits,sstatus);
1594:   }
1595:   VecRestoreArray(yin,&yv);
1596:   return(0);
1597: }

1599: /* --------------------------------------------------------------------------------------*/

1601: int VecScatterBegin_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1602: {
1603:   VecScatter_MPI_General *gen_to,*gen_from;
1604:   PetscScalar            *xv,*yv,*val,*svalues;
1605:   MPI_Request            *rwaits,*swaits;
1606:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

1609:   VecGetArray(xin,&xv);
1610:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
1611:   if (mode & SCATTER_REVERSE) {
1612:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1613:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1614:     rwaits   = gen_from->rev_requests;
1615:     swaits   = gen_to->rev_requests;
1616:   } else {
1617:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1618:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1619:     rwaits   = gen_from->requests;
1620:     swaits   = gen_to->requests;
1621:   }
1622:   svalues  = gen_to->values;
1623:   nrecvs   = gen_from->n;
1624:   nsends   = gen_to->n;
1625:   indices  = gen_to->indices;
1626:   sstarts  = gen_to->starts;

1628:   if (!(mode & SCATTER_LOCAL)) {

1630:     if (gen_to->sendfirst) {
1631:       /* this version packs and sends one at a time */
1632:       val  = svalues;
1633:       for (i=0; i<nsends; i++) {
1634:         iend = sstarts[i+1]-sstarts[i];

1636:         for (j=0; j<iend; j++) {
1637:           idx     = *indices++;
1638:           val[0] = xv[idx];
1639:           val[1] = xv[idx+1];
1640:           val    += 2;
1641:         }
1642:         MPI_Start_isend(2*iend,swaits+i);
1643:       }
1644:     }

1646:     if (!gen_from->use_readyreceiver) {
1647:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1648:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1649:     }

1651:     if (!gen_to->sendfirst) {
1652:       /* this version packs all the messages together and sends */
1653:       /*
1654:       len  = 2*sstarts[nsends];
1655:       val  = svalues;
1656:       for (i=0; i<len; i += 2) {
1657:         idx     = *indices++;
1658:         val[0] = xv[idx];
1659:         val[1] = xv[idx+1];
1660:         val      += 2;
1661:       }
1662:       MPI_Startall_isend(len,nsends,swaits);
1663:       */

1665:       /* this version packs and sends one at a time */
1666:       val  = svalues;
1667:       for (i=0; i<nsends; i++) {
1668:         iend = sstarts[i+1]-sstarts[i];

1670:         for (j=0; j<iend; j++) {
1671:           idx     = *indices++;
1672:           val[0] = xv[idx];
1673:           val[1] = xv[idx+1];
1674:           val    += 2;
1675:         }
1676:         MPI_Start_isend(2*iend,swaits+i);
1677:       }
1678:     }
1679:   }

1681:   /* take care of local scatters */
1682:   if (gen_to->local.n) {
1683:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1684:     int n       = gen_to->local.n,il,ir;
1685:     if (addv == INSERT_VALUES) {
1686:       if (gen_to->local.is_copy) {
1687:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1688:       } else {
1689:         for (i=0; i<n; i++) {
1690:           il = fslots[i]; ir = tslots[i];
1691:           yv[il]   = xv[ir];
1692:           yv[il+1] = xv[ir+1];
1693:         }
1694:       }
1695:     }  else if (addv == ADD_VALUES) {
1696:       for (i=0; i<n; i++) {
1697:         il = fslots[i]; ir = tslots[i];
1698:         yv[il]   += xv[ir];
1699:         yv[il+1] += xv[ir+1];
1700:       }
1701: #if !defined(PETSC_USE_COMPLEX)
1702:     }  else if (addv == MAX_VALUES) {
1703:       for (i=0; i<n; i++) {
1704:         il = fslots[i]; ir = tslots[i];
1705:         yv[il]   = PetscMax(yv[il],xv[ir]);
1706:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1707:       }
1708: #endif
1709:     }  else {SETERRQ(1,"Wrong insert option");}
1710:   }
1711:   VecRestoreArray(xin,&xv);
1712:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1713:   return(0);
1714: }

1716: /* --------------------------------------------------------------------------------------*/

1718: int VecScatterEnd_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1719: {
1720:   VecScatter_MPI_General *gen_to,*gen_from;
1721:   PetscScalar            *rvalues,*yv,*val;
1722:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1723:   MPI_Request            *rwaits,*swaits;
1724:   MPI_Status             rstatus,*sstatus;

1727:   if (mode & SCATTER_LOCAL) return(0);
1728:   VecGetArray(yin,&yv);

1730:   if (mode & SCATTER_REVERSE) {
1731:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1732:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1733:     rwaits   = gen_from->rev_requests;
1734:     swaits   = gen_to->rev_requests;
1735:     sstatus  = gen_from->sstatus;
1736:   } else {
1737:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1738:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1739:     rwaits   = gen_from->requests;
1740:     swaits   = gen_to->requests;
1741:     sstatus  = gen_to->sstatus;
1742:   }
1743:   rvalues  = gen_from->values;
1744:   nrecvs   = gen_from->n;
1745:   nsends   = gen_to->n;
1746:   indices  = gen_from->indices;
1747:   rstarts  = gen_from->starts;

1749:   /*  wait on receives */
1750:   count = nrecvs;
1751:   while (count) {
1752:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1753:     /* unpack receives into our local space */
1754:     val      = rvalues + 2*rstarts[imdex];
1755:     lindices = indices + rstarts[imdex];
1756:     n        = rstarts[imdex+1] - rstarts[imdex];
1757:     if (addv == INSERT_VALUES) {
1758:       for (i=0; i<n; i++) {
1759:         idx       = lindices[i];
1760:         yv[idx]   = val[0];
1761:         yv[idx+1] = val[1];
1762:         val      += 2;
1763:       }
1764:     } else if (addv == ADD_VALUES) {
1765:       for (i=0; i<n; i++) {
1766:         idx       = lindices[i];
1767:         yv[idx]   += val[0];
1768:         yv[idx+1] += val[1];
1769:         val       += 2;
1770:       }
1771: #if !defined(PETSC_USE_COMPLEX)
1772:     } else if (addv == MAX_VALUES) {
1773:       for (i=0; i<n; i++) {
1774:         idx       = lindices[i];
1775:         yv[idx]   = PetscMax(yv[idx],val[0]);
1776:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1777:         val       += 2;
1778:       }
1779: #endif
1780:     }  else {SETERRQ(1,"Wrong insert option");}
1781:     count--;
1782:   }
1783:   /* wait on sends */
1784:   if (nsends) {
1785:     MPI_Waitall(nsends,swaits,sstatus);
1786:   }
1787:   VecRestoreArray(yin,&yv);
1788:   return(0);
1789: }

1791: /* ---------------------------------------------------------------------------------*/

1793: int VecScatterDestroy_PtoP_X(VecScatter ctx)
1794: {
1795:   VecScatter_MPI_General *gen_to   = (VecScatter_MPI_General*)ctx->todata;
1796:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1797:   int                    i,ierr;

1800:   if (gen_to->use_readyreceiver) {
1801:     /*
1802:        Since the receives were already posted (ready-receiver mode) we must 
1803:        cancel them before freeing the requests
1804:     */
1805:     for (i=0; i<gen_from->n; i++) {
1806:       MPI_Cancel(gen_from->requests+i);
1807:     }
1808:   }

1810:   if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
1811:   if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
1812:   if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
1813:   if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}

1815:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
1816:   /* 
1817:      IBM's PE version of MPI has a bug where freeing these persistent requests
1818:      corrupts later message passing.
1819:   */
1820: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
1821:   for (i=0; i<gen_to->n; i++) {
1822:     MPI_Request_free(gen_to->requests + i);
1823:     MPI_Request_free(gen_to->rev_requests + i);
1824:   }

1826:   /*
1827:       MPICH could not properly cancel requests, so in ready-receiver mode the
1828:     requests could not be freed. That may be fixed now; if not, wrap the
1829:     following loop in if (!gen_to->use_readyreceiver) { ... }
1830:   */
1831:   for (i=0; i<gen_from->n; i++) {
1832:     MPI_Request_free(gen_from->requests + i);
1833:     MPI_Request_free(gen_from->rev_requests + i);
1834:   }
1835: #endif
1836: 
1837:   PetscFree(gen_to->sstatus);
1838:   PetscFree(gen_to->values);
1839:   PetscFree(gen_to->rev_requests);
1840:   PetscFree(gen_to);
1841:   PetscFree(gen_from->values);
1842:   PetscFree(gen_from->rev_requests);
1843:   PetscFree(gen_from);
1844:   PetscLogObjectDestroy(ctx);
1845:   PetscHeaderDestroy(ctx);
1846:   return(0);
1847: }
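/*
   Editor's note: in ready-receiver mode the receives are posted ahead of time, so
   they are cancelled above before the handles are released; MPI requires a
   cancelled request to still be completed, e.g. by MPI_Wait(), MPI_Test() or
   MPI_Request_free().  A minimal sketch of that sequence for a single started
   request (hypothetical helper, not the code above):
*/
static void ReleaseStartedRequest_Sketch(MPI_Request *req)
{
  MPI_Status status;
  MPI_Cancel(req);         /* mark the outstanding operation for cancellation */
  MPI_Wait(req,&status);   /* complete the (possibly cancelled) request       */
  MPI_Request_free(req);   /* now the persistent request can be freed         */
}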

1849: /* ==========================================================================================*/

1851: /*              create parallel to sequential scatter context                           */
1852: /*
1853:    bs indicates how many elements there are in each block. Normally
1854:    this would be 1.
1855: */
1856: int VecScatterCreate_PtoS(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,int bs,VecScatter ctx)
1857: {
1858:   VecScatter_MPI_General *from,*to;
1859:   int                    *source,*lens,rank,*owners;
1860:   int                    size,*lowner,*start,lengthy;
1861:   int                    *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
1862:   int                    *owner,*starts,count,tag,slen,ierr;
1863:   int                    *rvalues,*svalues,base,imdex,nmax,*values,len,*indx,nprocslocal;
1864:   MPI_Comm               comm;
1865:   MPI_Request            *send_waits,*recv_waits;
1866:   MPI_Status             recv_status,*send_status;
1867:   PetscMap               map;
1868:   PetscTruth             found;
1869: 
1871:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1872:   PetscObjectGetComm((PetscObject)xin,&comm);
1873:   MPI_Comm_rank(comm,&rank);
1874:   MPI_Comm_size(comm,&size);
1875:   VecGetPetscMap(xin,&map);
1876:   PetscMapGetGlobalRange(map,&owners);

1878:   VecGetSize(yin,&lengthy);

1880:   /*  first count number of contributors to each processor */
1881:   PetscMalloc(2*size*sizeof(int),&nprocs);
1882:   ierr   = PetscMemzero(nprocs,2*size*sizeof(int));
1883:   procs  = nprocs + size;
1884:   PetscMalloc((nx+1)*sizeof(int),&owner);
1885:   for (i=0; i<nx; i++) {
1886:     idx = inidx[i];
1887:     found = PETSC_FALSE;
1888:     for (j=0; j<size; j++) {
1889:       if (idx >= owners[j] && idx < owners[j+1]) {
1890:         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
1891:       }
1892:     }
1893:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
1894:   }
1895:   nprocslocal  = nprocs[rank];
1896:   nprocs[rank] = procs[rank] = 0;
1897:   nsends       = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

1899:   /* inform other processors of number of messages and max length*/
1900:   PetscMalloc(2*size*sizeof(int),&work);
1901:   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
1902:   nmax   = work[rank];
1903:   nrecvs = work[size+rank];
1904:   ierr   = PetscFree(work);
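  /*
     Editor's note: nprocs[0..size-1] holds how many indices this process sends to
     each rank and procs[0..size-1] holds 0/1 "sends to that rank" flags.  The
     reduction with PetscMaxSum_Op (defined elsewhere in PETSc) combines them so
     that work[rank] is the longest message any process sends to this rank (used
     to size the receive buffers below) and work[size+rank] is the number of
     processes that send to it, i.e. the number of receives to post.
  */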

1906:   /* post receives:   */
1907:   PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
1908:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
1909:   for (i=0; i<nrecvs; i++) {
1910:     MPI_Irecv((rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1911:   }

1913:   /* do sends:
1914:       1) starts[i] gives the starting index in svalues for the indices going to 
1915:          the ith processor
1916:   */
1917:   PetscMalloc((nx+1)*sizeof(int),&svalues);
1918:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
1919:   PetscMalloc((size+1)*sizeof(int),&starts);
1920:   starts[0]  = 0;
1921:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1922:   for (i=0; i<nx; i++) {
1923:     if (owner[i] != rank) {
1924:       svalues[starts[owner[i]]++] = inidx[i];
1925:     }
1926:   }

1928:   starts[0] = 0;
1929:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1930:   count = 0;
1931:   for (i=0; i<size; i++) {
1932:     if (procs[i]) {
1933:       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
1934:     }
1935:   }
1936:   PetscFree(starts);

1938:   base = owners[rank];

1940:   /*  wait on receives */
1941:   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
1942:   source = lens + nrecvs;
1943:   count  = nrecvs;
1944:   slen   = 0;
1945:   while (count) {
1946:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1947:     /* unpack receives into our local space */
1948:     MPI_Get_count(&recv_status,MPI_INT,&n);
1949:     source[imdex]  = recv_status.MPI_SOURCE;
1950:     lens[imdex]    = n;
1951:     slen          += n;
1952:     count--;
1953:   }
1954:   PetscFree(recv_waits);
1955: 
1956:   /* allocate entire send scatter context */
1957:   PetscNew(VecScatter_MPI_General,&to);
1958:   PetscMemzero(to,sizeof(VecScatter_MPI_General));
1959:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1960:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
1961:   len = slen*sizeof(int) + bs*slen*sizeof(PetscScalar) + (nrecvs+1)*sizeof(int) +
1962:         nrecvs*(sizeof(int) + sizeof(MPI_Request));
1963:   to->n         = nrecvs;
1964:   PetscMalloc(len,&to->values);
1965:   PetscLogObjectMemory(ctx,len);
1966:   to->requests  = (MPI_Request*)(to->values + bs*slen);
1967:   to->indices   = (int*)(to->requests + nrecvs);
1968:   to->starts    = (int*)(to->indices + slen);
1969:   to->procs     = (int*)(to->starts + nrecvs + 1);
1970:   ierr          = PetscMalloc(2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status),&to->sstatus);
1971:   to->rstatus   = to->sstatus + PetscMax(nrecvs,nsends) + 1;
1972:   PetscLogObjectMemory(ctx,2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status));
1973:   ctx->todata   = (void*)to;
1974:   to->starts[0] = 0;
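  /*
     Editor's note: the single PetscMalloc() above carves one block of memory into
     five arrays; the pointer arithmetic implies the layout

         to->values   :  bs*slen   PetscScalar   packed message buffer
         to->requests :  nrecvs    MPI_Request   persistent requests, one per message
         to->indices  :  slen      int           local indices, grouped by message
         to->starts   :  nrecvs+1  int           offset of each message's group
         to->procs    :  nrecvs    int           neighbor rank for each message
  */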

1976:   if (nrecvs) {
1977:     PetscMalloc(nrecvs*sizeof(int),&indx);
1978:     for (i=0; i<nrecvs; i++) indx[i] = i;
1979:     PetscSortIntWithPermutation(nrecvs,source,indx);

1981:     /* move the data into the send scatter */
1982:     for (i=0; i<nrecvs; i++) {
1983:       to->starts[i+1] = to->starts[i] + lens[indx[i]];
1984:       to->procs[i]    = source[indx[i]];
1985:       values = rvalues + indx[i]*nmax;
1986:       for (j=0; j<lens[indx[i]]; j++) {
1987:         to->indices[to->starts[i] + j] = values[j] - base;
1988:       }
1989:     }
1990:     PetscFree(indx);
1991:   }
1992:   PetscFree(rvalues);
1993:   PetscFree(lens);
1994: 
1995:   /* allocate entire receive scatter context */
1996:   PetscNew(VecScatter_MPI_General,&from);
1997:   PetscMemzero(from,sizeof(VecScatter_MPI_General));
1998:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
1999:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2000:   len = ny*sizeof(int) + ny*bs*sizeof(PetscScalar) + (nsends+1)*sizeof(int) +
2001:         nsends*(sizeof(int) + sizeof(MPI_Request));
2002:   from->n        = nsends;
2003:   PetscMalloc(len,&from->values);
2004:   PetscLogObjectMemory(ctx,len);
2005:   from->requests = (MPI_Request*)(from->values + bs*ny);
2006:   from->indices  = (int*)(from->requests + nsends);
2007:   from->starts   = (int*)(from->indices + ny);
2008:   from->procs    = (int*)(from->starts + nsends + 1);
2009:   ctx->fromdata  = (void*)from;

2011:   /* move data into receive scatter */
2012:   PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2013:   start = lowner + size;
2014:   count = 0; from->starts[0] = start[0] = 0;
2015:   for (i=0; i<size; i++) {
2016:     if (procs[i]) {
2017:       lowner[i]            = count;
2018:       from->procs[count++] = i;
2019:       from->starts[count]  = start[count] = start[count-1] + nprocs[i];
2020:     }
2021:   }
2022:   for (i=0; i<nx; i++) {
2023:     if (owner[i] != rank) {
2024:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
2025:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2026:     }
2027:   }
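  /*
     Editor's note: lowner[p] maps a neighbor rank p to its compressed message
     number (0..nsends-1) and start[m] is the running write cursor inside message m,
     so the loop above places each off-process destination index inidy[i] into the
     group for the rank that owns inidx[i], i.e. the rank the corresponding data
     will arrive from in a forward scatter.
  */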
2028:   PetscFree(lowner);
2029:   PetscFree(owner);
2030:   PetscFree(nprocs);
2031: 
2032:   /* wait on sends */
2033:   if (nsends) {
2034:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2035:     MPI_Waitall(nsends,send_waits,send_status);
2036:     PetscFree(send_status);
2037:   }
2038:   PetscFree(send_waits);
2039:   PetscFree(svalues);

2041:   if (nprocslocal) {
2042:     int nt;
2043:     /* we have a scatter to ourselves */
2044:     from->local.n     = to->local.n = nt = nprocslocal;
2045:     PetscMalloc(nt*sizeof(int),&from->local.slots);
2046:     PetscMalloc(nt*sizeof(int),&to->local.slots);
2047:     PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2048:     nt = 0;
2049:     for (i=0; i<nx; i++) {
2050:       idx = inidx[i];
2051:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2052:         to->local.slots[nt]     = idx - owners[rank];
2053:         from->local.slots[nt++] = inidy[i];
2054:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2055:       }
2056:     }
2057:   } else {
2058:     from->local.n     = 0;
2059:     from->local.slots = 0;
2060:     to->local.n       = 0;
2061:     to->local.slots   = 0;
2062:   }
2063:   from->local.nonmatching_computed = PETSC_FALSE;
2064:   from->local.n_nonmatching        = 0;
2065:   from->local.slots_nonmatching    = 0;
2066:   to->local.nonmatching_computed   = PETSC_FALSE;
2067:   to->local.n_nonmatching          = 0;
2068:   to->local.slots_nonmatching      = 0;

2070:   to->type   = VEC_SCATTER_MPI_GENERAL;
2071:   from->type = VEC_SCATTER_MPI_GENERAL;

2073:   from->bs = bs;
2074:   to->bs   = bs;
2075:   if (bs > 1) {
2076:     PetscTruth  flg,flgs = PETSC_FALSE;
2077:     int         *sstarts = to->starts,  *rstarts = from->starts;
2078:     int         *sprocs  = to->procs,   *rprocs  = from->procs;
2079:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
2080:     MPI_Request *rev_swaits,*rev_rwaits;
2081:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

2083:     ctx->destroy   = VecScatterDestroy_PtoP_X;
2084:     ctx->copy      = VecScatterCopy_PtoP_X;
2085:     ctx->view      = VecScatterView_MPI;
2086: 
2087:     tag      = ctx->tag;
2088:     comm     = ctx->comm;

2090:     /* allocate additional wait variables for the "reverse" scatter */
2091:     PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&rev_rwaits);
2092:     to->rev_requests   = rev_rwaits;
2093:     PetscMalloc((nsends+1)*sizeof(MPI_Request),&rev_swaits);
2094:     from->rev_requests = rev_swaits;
2095:     PetscLogObjectMemory(ctx,(nsends+nrecvs+2)*sizeof(MPI_Request));

2097:     /* Register the receives that you will use later (sends for scatter reverse) */
2098:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flgs);
2099:     if (flgs) {
2100:       PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter Ssend mode\n");
2101:     }
2102:     for (i=0; i<from->n; i++) {
2103:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2104:                            comm,rwaits+i);
2105:       if (!flgs) {
2106:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2107:                              comm,rev_swaits+i);
2108:       } else {
2109:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2110:                               comm,rev_swaits+i);
2111:       }
2112:     }

2114:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
2115:     if (flg) {
2116:       ctx->postrecvs           = VecScatterPostRecvs_PtoP_X;
2117:       to->use_readyreceiver    = PETSC_TRUE;
2118:       from->use_readyreceiver  = PETSC_TRUE;
2119:       for (i=0; i<to->n; i++) {
2120:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2121:                               comm,swaits+i);
2122:       }
2123:       PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter ready receiver mode\n");
2124:     } else {
2125:       ctx->postrecvs           = 0;
2126:       to->use_readyreceiver    = PETSC_FALSE;
2127:       from->use_readyreceiver  = PETSC_FALSE;
2128:       for (i=0; i<to->n; i++) {
2129:         if (!flgs) {
2130:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2131:                                comm,swaits+i);
2132:         } else {
2133:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2134:                                comm,swaits+i);
2135:         }
2136:       }
2137:     }
2138:     /* Register receives for scatter reverse */
2139:     for (i=0; i<to->n; i++) {
2140:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2141:                            comm,rev_rwaits+i);
2142:     }
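    /*
       Editor's note: three flavors of persistent send can be set up above:
       MPI_Send_init() (standard, the default), MPI_Ssend_init() (synchronous,
       selected with -vecscatter_ssend) and MPI_Rsend_init() (ready send, selected
       with -vecscatter_rr).  A ready send is only legal once the matching receive
       has been posted, which is why that branch installs ctx->postrecvs =
       VecScatterPostRecvs_PtoP_X and sets use_readyreceiver on both halves.
    */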

2144:     PetscLogInfo(0,"VecScatterCreate_PtoS:Using blocksize %d scatter\n",bs);
2145:     switch (bs) {
2146:     case 12:
2147:       ctx->begin     = VecScatterBegin_PtoP_12;
2148:       ctx->end       = VecScatterEnd_PtoP_12;
2149:       break;
2150:     case 5:
2151:       ctx->begin     = VecScatterBegin_PtoP_5;
2152:       ctx->end       = VecScatterEnd_PtoP_5;
2153:       break;
2154:     case 4:
2155:       ctx->begin     = VecScatterBegin_PtoP_4;
2156:       ctx->end       = VecScatterEnd_PtoP_4;
2157:       break;
2158:     case 3:
2159:       ctx->begin     = VecScatterBegin_PtoP_3;
2160:       ctx->end       = VecScatterEnd_PtoP_3;
2161:       break;
2162:     case 2:
2163:       ctx->begin     = VecScatterBegin_PtoP_2;
2164:       ctx->end       = VecScatterEnd_PtoP_2;
2165:       break;
2166:     default:
2167:       SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
2168:     }
2169:   } else {
2170:     ctx->postrecvs = 0;
2171:     ctx->destroy   = VecScatterDestroy_PtoP;
2172:     ctx->begin     = VecScatterBegin_PtoP;
2173:     ctx->end       = VecScatterEnd_PtoP;
2174:     ctx->copy      = VecScatterCopy_PtoP;
2175:     ctx->view      = VecScatterView_MPI;
2176:   }

2178:   /* Check if the local scatter is actually a copy; important special case */
2179:   if (nprocslocal) {
2180:     VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
2181:   }

2183:   return(0);
2184: }
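/*
   Editor's note: VecScatterLocalOptimizeCopy_Private() (called just above, defined
   elsewhere) detects the important special case in which the purely local part of
   the scatter is one contiguous block copy, so the Begin routines can use a single
   PetscMemcpy() (the local.is_copy branch) instead of an indexed loop.  A minimal
   sketch, assuming block size bs, of the kind of check involved (hypothetical
   helper, not the PETSc implementation):
*/
static PetscTruth LocalScatterIsCopy_Sketch(int n,const int *fslots,const int *tslots,int bs)
{
  int i;
  for (i=1; i<n; i++) {
    /* every slot must follow its predecessor by exactly one block on both sides */
    if (fslots[i] != fslots[i-1] + bs) return PETSC_FALSE;
    if (tslots[i] != tslots[i-1] + bs) return PETSC_FALSE;
  }
  return PETSC_TRUE;
}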

2186: /* ------------------------------------------------------------------------------------*/
2187: /*
2188:          Scatter from local Seq vectors to a parallel vector. 
2189: */
2190: int VecScatterCreate_StoP(int nx,int *inidx,int ny,int *inidy,Vec yin,VecScatter ctx)
2191: {
2192:   VecScatter_MPI_General *from,*to;
2193:   int                    *source,nprocslocal,*lens,rank = yin->stash.rank,*owners = yin->map->range;
2194:   int                    ierr,size = yin->stash.size,*lowner,*start;
2195:   int                    *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
2196:   int                    *owner,*starts,count,tag,slen;
2197:   int                    *rvalues,*svalues,base,imdex,nmax,*values,len;
2198:   PetscTruth             found;
2199:   MPI_Comm               comm = yin->comm;
2200:   MPI_Request            *send_waits,*recv_waits;
2201:   MPI_Status             recv_status,*send_status;

2204:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2205:   /*  first count number of contributors to each processor */
2206:   PetscMalloc(2*size*sizeof(int),&nprocs);
2207:   ierr   = PetscMemzero(nprocs,2*size*sizeof(int));
2208:   procs  = nprocs + size;
2209:   PetscMalloc((nx+1)*sizeof(int),&owner);
2210:   for (i=0; i<nx; i++) {
2211:     idx = inidy[i];
2212:     found = PETSC_FALSE;
2213:     for (j=0; j<size; j++) {
2214:       if (idx >= owners[j] && idx < owners[j+1]) {
2215:         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
2216:       }
2217:     }
2218:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2219:   }
2220:   nprocslocal  = nprocs[rank];
2221:   nprocs[rank] = procs[rank] = 0;
2222:   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

2224:   /* inform other processors of number of messages and max length*/
2225:   ierr   = PetscMalloc(2*size*sizeof(int),&work);
2226:   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
2227:   nmax   = work[rank];
2228:   nrecvs = work[size+rank];
2229:   ierr   = PetscFree(work);

2231:   /* post receives:   */
2232:   PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2233:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2234:   for (i=0; i<nrecvs; i++) {
2235:     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2236:   }

2238:   /* do sends:
2239:       1) starts[i] gives the starting index in svalues for the indices going to 
2240:          the ith processor
2241:   */
2242:   PetscMalloc((nx+1)*sizeof(int),&svalues);
2243:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2244:   PetscMalloc((size+1)*sizeof(int),&starts);
2245:   starts[0]  = 0;
2246:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2247:   for (i=0; i<nx; i++) {
2248:     if (owner[i] != rank) {
2249:       svalues[starts[owner[i]]++] = inidy[i];
2250:     }
2251:   }

2253:   starts[0] = 0;
2254:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2255:   count = 0;
2256:   for (i=0; i<size; i++) {
2257:     if (procs[i]) {
2258:       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count);
2259:       count++;
2260:     }
2261:   }
2262:   PetscFree(starts);

2264:   /* allocate entire send scatter context */
2265:   PetscNew(VecScatter_MPI_General,&to);
2266:   PetscMemzero(to,sizeof(VecScatter_MPI_General));
2267:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2268:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
2269:   len  = ny*(sizeof(int) + sizeof(PetscScalar)) + (nsends+1)*sizeof(int) +
2270:         nsends*(sizeof(int) + sizeof(MPI_Request));
2271:   to->n        = nsends;
2272:   PetscMalloc(len,&to->values);
2273:   PetscLogObjectMemory(ctx,len);
2274:   to->requests = (MPI_Request*)(to->values + ny);
2275:   to->indices  = (int*)(to->requests + nsends);
2276:   to->starts   = (int*)(to->indices + ny);
2277:   to->procs    = (int*)(to->starts + nsends + 1);
2278:   ierr         = PetscMalloc(2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status),&to->sstatus);
2279:   to->rstatus  = to->sstatus + PetscMax(nsends,nrecvs) + 1;
2280:   PetscLogObjectMemory(ctx,2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status));
2281:   ctx->todata  = (void*)to;

2283:   /* move data into send scatter context */
2284:   PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2285:   start         = lowner + size;
2286:   count         = 0;
2287:   to->starts[0] = start[0] = 0;
2288:   for (i=0; i<size; i++) {
2289:     if (procs[i]) {
2290:       lowner[i]          = count;
2291:       to->procs[count++] = i;
2292:       to->starts[count]  = start[count] = start[count-1] + nprocs[i];
2293:     }
2294:   }
2295:   for (i=0; i<nx; i++) {
2296:     if (owner[i] != rank) {
2297:       to->indices[start[lowner[owner[i]]]++] = inidx[i];
2298:     }
2299:   }
2300:   PetscFree(lowner);
2301:   PetscFree(owner);
2302:   PetscFree(nprocs);

2304:   base = owners[rank];

2306:   /*  wait on receives */
2307:   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2308:   source = lens + nrecvs;
2309:   count  = nrecvs; slen = 0;
2310:   while (count) {
2311:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2312:     /* unpack receives into our local space */
2313:     MPI_Get_count(&recv_status,MPI_INT,&n);
2314:     source[imdex]  = recv_status.MPI_SOURCE;
2315:     lens[imdex]    = n;
2316:     slen          += n;
2317:     count--;
2318:   }
2319:   PetscFree(recv_waits);
2320: 
2321:   /* allocate entire receive scatter context */
2322:   PetscNew(VecScatter_MPI_General,&from);
2323:   PetscMemzero(from,sizeof(VecScatter_MPI_General));
2324:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2325:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
2326:   len  = slen*(sizeof(int) + sizeof(PetscScalar)) + (nrecvs+1)*sizeof(int) +
2327:           nrecvs*(sizeof(int) + sizeof(MPI_Request));
2328:   from->n        = nrecvs;
2329:   PetscMalloc(len,&from->values);
2330:   PetscLogObjectMemory(ctx,len);
2331:   from->requests = (MPI_Request*)(from->values + slen);
2332:   from->indices  = (int*)(from->requests + nrecvs);
2333:   from->starts   = (int*)(from->indices + slen);
2334:   from->procs    = (int*)(from->starts + nrecvs + 1);
2335:   ctx->fromdata  = (void*)from;

2337:   /* move the data into the receive scatter context*/
2338:   from->starts[0] = 0;
2339:   for (i=0; i<nrecvs; i++) {
2340:     from->starts[i+1] = from->starts[i] + lens[i];
2341:     from->procs[i]    = source[i];
2342:     values            = rvalues + i*nmax;
2343:     for (j=0; j<lens[i]; j++) {
2344:       from->indices[from->starts[i] + j] = values[j] - base;
2345:     }
2346:   }
2347:   PetscFree(rvalues);
2348:   PetscFree(lens);
2349: 
2350:   /* wait on sends */
2351:   if (nsends) {
2352:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2353:     MPI_Waitall(nsends,send_waits,send_status);
2354:     PetscFree(send_status);
2355:   }
2356:   PetscFree(send_waits);
2357:   PetscFree(svalues);

2359:   if (nprocslocal) {
2360:     int nt;
2361:     /* we have a scatter to ourselves */
2362:     from->local.n     = to->local.n = nt = nprocslocal;
2363:     PetscMalloc(nt*sizeof(int),&from->local.slots);
2364:     PetscMalloc(nt*sizeof(int),&to->local.slots);
2365:     PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2366:     nt = 0;
2367:     for (i=0; i<ny; i++) {
2368:       idx = inidy[i];
2369:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2370:         from->local.slots[nt] = idx - owners[rank];
2371:         to->local.slots[nt++] = inidx[i];
2372:       }
2373:     }
2374:   } else {
2375:     from->local.n     = 0;
2376:     from->local.slots = 0;
2377:     to->local.n       = 0;
2378:     to->local.slots   = 0;

2380:   }
2381:   from->local.nonmatching_computed = PETSC_FALSE;
2382:   from->local.n_nonmatching        = 0;
2383:   from->local.slots_nonmatching    = 0;
2384:   to->local.nonmatching_computed   = PETSC_FALSE;
2385:   to->local.n_nonmatching          = 0;
2386:   to->local.slots_nonmatching      = 0;

2388:   to->type   = VEC_SCATTER_MPI_GENERAL;
2389:   from->type = VEC_SCATTER_MPI_GENERAL;

2391:   ctx->destroy    = VecScatterDestroy_PtoP;
2392:   ctx->postrecvs  = 0;
2393:   ctx->begin      = VecScatterBegin_PtoP;
2394:   ctx->end        = VecScatterEnd_PtoP;
2395:   ctx->copy       = 0;
2396:   ctx->view       = VecScatterView_MPI;

2398:   to->bs   = 1;
2399:   from->bs = 1;
2400:   return(0);
2401: }

2403: /* ---------------------------------------------------------------------------------*/
2404: int VecScatterCreate_PtoP(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,VecScatter ctx)
2405: {
2406:   int         *lens,rank,*owners = xin->map->range,size;
2407:   int         *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work,*local_inidx,*local_inidy;
2408:   int         *owner,*starts,count,tag,slen,ierr;
2409:   int         *rvalues,*svalues,base,imdex,nmax,*values;
2410:   MPI_Comm    comm;
2411:   MPI_Request *send_waits,*recv_waits;
2412:   MPI_Status  recv_status;
2413:   PetscTruth  duplicate = PETSC_FALSE,found;

2416:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2417:   PetscObjectGetComm((PetscObject)xin,&comm);
2418:   MPI_Comm_size(comm,&size);
2419:   MPI_Comm_rank(comm,&rank);
2420:   if (size == 1) {
2421:     VecScatterCreate_StoP(nx,inidx,ny,inidy,yin,ctx);
2422:     return(0);
2423:   }

2425:   /*
2426:      Each processor ships its inidx[j] and inidy[j] pairs to the processor that owns inidx[j];
2427:      every processor then calls VecScatterCreate_StoP() on the lists it receives
2428:   */
2429:   /*  first count number of contributors to each processor */
2430:   ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);
2431:   procs = nprocs + size;
2432:   ierr  = PetscMemzero(nprocs,2*size*sizeof(int));
2433:   ierr  = PetscMalloc((nx+1)*sizeof(int),&owner);
2434:   for (i=0; i<nx; i++) {
2435:     idx   = inidx[i];
2436:     found = PETSC_FALSE;
2437:     for (j=0; j<size; j++) {
2438:       if (idx >= owners[j] && idx < owners[j+1]) {
2439:         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
2440:       }
2441:     }
2442:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2443:   }
2444:   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

2446:   /* inform other processors of number of messages and max length*/
2447:   PetscMalloc(2*size*sizeof(int),&work);
2448:   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
2449:   nmax   = work[rank];
2450:   nrecvs = work[size+rank];
2451:   ierr   = PetscFree(work);

2453:   /* post receives:   */
2454:   PetscMalloc(2*(nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2455:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2456:   for (i=0; i<nrecvs; i++) {
2457:     MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2458:   }

2460:   /* do sends:
2461:       1) starts[i] gives the starting index in svalues for the indices going to 
2462:          the ith processor
2463:   */
2464:   PetscMalloc(2*(nx+1)*sizeof(int),&svalues);
2465:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2466:   PetscMalloc((size+1)*sizeof(int),&starts);
2467:   starts[0]  = 0;
2468:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2469:   for (i=0; i<nx; i++) {
2470:     svalues[2*starts[owner[i]]]       = inidx[i];
2471:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2472:   }
2473:   PetscFree(owner);
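  /*
     Editor's note: svalues interleaves each pair, two ints per entry: slot 2*k holds
     the from-index inidx[] and slot 2*k+1 the matching to-index inidy[], so the
     owning process receives both halves of every pair in one message; the receive
     loop below splits them back into local_inidx[] (shifted by the owner's base)
     and local_inidy[].
  */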

2475:   starts[0] = 0;
2476:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2477:   count = 0;
2478:   for (i=0; i<size; i++) {
2479:     if (procs[i]) {
2480:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPI_INT,i,tag,comm,send_waits+count);
2481:       count++;
2482:     }
2483:   }
2484:   PetscFree(starts);
2485:   PetscFree(nprocs);

2487:   base = owners[rank];

2489:   /*  wait on receives */
2490:   PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2491:   count = nrecvs;
2492:   slen  = 0;
2493:   while (count) {
2494:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2495:     /* unpack receives into our local space */
2496:     MPI_Get_count(&recv_status,MPI_INT,&n);
2497:     lens[imdex]  =  n/2;
2498:     slen         += n/2;
2499:     count--;
2500:   }
2501:   PetscFree(recv_waits);
2502: 
2503:   PetscMalloc(2*(slen+1)*sizeof(int),&local_inidx);
2504:   local_inidy = local_inidx + slen;

2506:   count = 0;
2507:   for (i=0; i<nrecvs; i++) {
2508:     values = rvalues + 2*i*nmax;
2509:     for (j=0; j<lens[i]; j++) {
2510:       local_inidx[count]   = values[2*j] - base;
2511:       local_inidy[count++] = values[2*j+1];
2512:     }
2513:   }
2514:   PetscFree(rvalues);
2515:   PetscFree(lens);

2517:   /* wait on sends */
2518:   if (nsends) {
2519:     MPI_Status *send_status;
2520:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2521:     MPI_Waitall(nsends,send_waits,send_status);
2522:     PetscFree(send_status);
2523:   }
2524:   PetscFree(send_waits);
2525:   PetscFree(svalues);

2527:   /*
2528:      should sort and remove duplicates from local_inidx,local_inidy 
2529:   */

2531: #if defined(do_it_slow)
2532:   /* sort on the from index */
2533:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2534:   start = 0;
2535:   while (start < slen) {
2536:     count = start+1;
2537:     last  = local_inidx[start];
2538:     while (count < slen && last == local_inidx[count]) count++;
2539:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2540:       /* sort on to index */
2541:       PetscSortInt(count-start,local_inidy+start);
2542:     }
2543:     /* remove duplicates; not most efficient way, but probably good enough */
2544:     i = start;
2545:     while (i < count-1) {
2546:       if (local_inidy[i] != local_inidy[i+1]) {
2547:         i++;
2548:       } else { /* found a duplicate */
2549:         duplicate = PETSC_TRUE;
2550:         for (j=i; j<slen-1; j++) {
2551:           local_inidx[j] = local_inidx[j+1];
2552:           local_inidy[j] = local_inidy[j+1];
2553:         }
2554:         slen--;
2555:         count--;
2556:         /* printf("found dup %d %d\n",local_inidx[i],local_inidy[i]);*/
2557:       }
2558:     }
2559:     start = count;
2560:   }
2561: #endif
2562:   if (duplicate) {
2563:     PetscLogInfo(0,"VecScatterCreate_PtoP:Duplicate to from indices passed in VecScatterCreate(), they are ignored\n");
2564:   }
2565:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,yin,ctx);
2566:   PetscFree(local_inidx);
2567:   return(0);
2568: }