Actual source code: comb.c

  1: #define PETSCVEC_DLL
  2: /*
  3:       Split phase global vector reductions with support for combining the
  4:    communication portion of several operations. Only MPI-1.1 functionality is used.

  6:       The idea for this and much of the initial code was contributed by 
  7:    Victor Eijkhout.

  9:        Usage:
 10:              VecDotBegin(Vec,Vec,PetscScalar *);
 11:              VecNormBegin(Vec,NormType,PetscReal *);
 12:              ....
 13:              VecDotEnd(Vec,Vec,PetscScalar *);
 14:              VecNormEnd(Vec,NormType,PetscReal *);

 16:        Limitations: 
 17:          - The xxxEnd() functions MUST be called in the same order as the
 18:            corresponding xxxBegin() functions. There is extensive error checking
 19:            to help ensure that the user calls the routines in the correct order.
 20: */
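For illustration only (this helper is not part of comb.c), the intended calling pattern looks roughly like the sketch below; the single combined MPI_Allreduce() is triggered inside the first xxxEnd() call:

    #include "petscvec.h"

    /* Sketch: overlap the communication of a dot product and a 2-norm. */
    PetscErrorCode OverlappedReductions(Vec x,Vec y,PetscScalar *dot,PetscReal *norm)
    {
      PetscErrorCode ierr;

      PetscFunctionBegin;
      ierr = VecDotBegin(x,y,dot);CHKERRQ(ierr);        /* queue the dot product */
      ierr = VecNormBegin(x,NORM_2,norm);CHKERRQ(ierr); /* queue the norm        */
      /* ... independent local work could be done here ... */
      ierr = VecDotEnd(x,y,dot);CHKERRQ(ierr);          /* End() calls must come */
      ierr = VecNormEnd(x,NORM_2,norm);CHKERRQ(ierr);   /* in the same order     */
      PetscFunctionReturn(0);
    }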

 22:  #include "vecimpl.h"

 24: #define STATE_BEGIN 0
 25: #define STATE_END   1

 27: #define REDUCE_SUM  0
 28: #define REDUCE_MAX  1
 29: #define REDUCE_MIN  2

 31: typedef struct {
 32:   MPI_Comm     comm;
 33:   PetscScalar  *lvalues;    /* these are the local values before the call to MPI_Allreduce() */
 34:   PetscScalar  *gvalues;    /* global values after the call to MPI_Allreduce() */
 35:   void         **invecs;    /* for debugging only, vector/memory used with each op */
 36:   PetscInt     *reducetype; /* is a particular value to be summed, maxed, or minned? */
 37:   PetscInt     state;       /* are we calling xxxBegin() or xxxEnd()? */
 38:   PetscInt     maxops;      /* total amount of space we have for requests */
 39:   PetscInt     numopsbegin; /* number of reductions that have been queued by xxxBegin() */
 40:   PetscInt     numopsend;   /* number of reductions whose results have been retrieved by xxxEnd() */
 41: } PetscSplitReduction;
 42: /*
 43:    Note: lvalues and gvalues are twice as long as maxops; this allows the second half of
 44: the entries to hold a flag indicating whether each value is a REDUCE_SUM, REDUCE_MAX, or REDUCE_MIN.
 45: These flags are used by the custom reduction operation that replaces MPI_SUM, MPI_MAX, or MPI_MIN
 46: when a single reduction involves some of each.
 47: */
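As a concrete picture (purely illustrative, assuming a real-scalar build; local_dot, local_max, and comm are placeholder names, not part of comb.c), two queued operations, a dot product and a max norm, would be packed and reduced roughly like this:

    PetscScalar lvalues[4],gvalues[4];
    lvalues[0] = local_dot;   /* placeholder: this process's partial dot product */
    lvalues[1] = local_max;   /* placeholder: this process's local maximum       */
    lvalues[2] = REDUCE_SUM;  /* flag: entry 0 is to be summed                   */
    lvalues[3] = REDUCE_MAX;  /* flag: entry 1 is to be maxed                    */
    MPI_Allreduce(lvalues,gvalues,4,MPIU_REAL,PetscSplitReduction_Op,comm);
    /* gvalues[0] now holds the global dot product, gvalues[1] the global max    */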

 51: /*
 52:    PetscSplitReductionCreate - Creates a data structure to contain the queued information.
 53: */
 54: PetscErrorCode PETSCVEC_DLLEXPORT PetscSplitReductionCreate(MPI_Comm comm,PetscSplitReduction **sr)
 55: {

 59:   PetscNew(PetscSplitReduction,sr);
 60:   (*sr)->numopsbegin = 0;
 61:   (*sr)->numopsend   = 0;
 62:   (*sr)->state       = STATE_BEGIN;
 63:   (*sr)->maxops      = 32;
 64:   PetscMalloc(2*32*sizeof(PetscScalar),&(*sr)->lvalues);
 65:   PetscMalloc(2*32*sizeof(PetscScalar),&(*sr)->gvalues);
 66:   PetscMalloc(32*sizeof(void*),&(*sr)->invecs);
 67:   (*sr)->comm        = comm;
 68:   PetscMalloc(32*sizeof(PetscInt),&(*sr)->reducetype);
 69:   return(0);
 70: }

 72: /*
 73:        This function is the MPI reduction operation used when the reduction
 74:    contains a combination of sums, maxes, and/or mins. The call to
 75:    MPI_Op_create() converts the function PetscSplitReduction_Local() into the
 76:    MPI operator PetscSplitReduction_Op.
 77: */
 78: MPI_Op PetscSplitReduction_Op = 0;
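The MPI_Op_create() registration itself does not appear in this listing; performed once during PETSc's vector-package initialization, it would look roughly like the sketch below (error checking omitted):

    /* Sketch only: register the combined reduction operator used by
       PetscSplitReductionApply(). The operation is commutative, hence the 1. */
    MPI_Op_create(PetscSplitReduction_Local,1,&PetscSplitReduction_Op);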

 83: void PETSCVEC_DLLEXPORT PetscSplitReduction_Local(void *in,void *out,PetscMPIInt *cnt,MPI_Datatype *datatype)
 84: {
 85:   PetscScalar *xin = (PetscScalar *)in,*xout = (PetscScalar*)out;
 86:   PetscInt    i,count = (PetscInt)*cnt;

 88:   PetscFunctionBegin;
 89:   if (*datatype != MPIU_REAL) {
 90:     (*PetscErrorPrintf)("Can only handle MPIU_REAL data types");
 91:     MPI_Abort(MPI_COMM_WORLD,1);
 92:   }
 93: #if defined(PETSC_USE_COMPLEX)
 94:   count = count/2;
 95: #endif
 96:   count = count/2;
 97:   for (i=0; i<count; i++) {
 98:     if (((int)PetscRealPart(xin[count+i])) == REDUCE_SUM) { /* second half of xin[] is flags for reduction type */
 99:       xout[i] += xin[i];
100:     } else if ((PetscInt)PetscRealPart(xin[count+i]) == REDUCE_MAX) {
101:       xout[i] = PetscMax(*(PetscReal *)(xout+i),*(PetscReal *)(xin+i));
102:     } else if ((PetscInt)PetscRealPart(xin[count+i]) == REDUCE_MIN) {
103:       xout[i] = PetscMin(*(PetscReal *)(xout+i),*(PetscReal *)(xin+i));
104:     } else {
105:       (*PetscErrorPrintf)("Reduction type input is not REDUCE_SUM, REDUCE_MAX, or REDUCE_MIN");
106:       MPI_Abort(MPI_COMM_WORLD,1);
107:     }
108:   }
109:   PetscStackPop; /* since function returns void cannot use PetscFunctionReturn(); */
110:   return;
111: }

116: /*
117:    PetscSplitReductionApply - Actually do the communication required for a split phase reduction
118: */
119: PetscErrorCode PETSCVEC_DLLEXPORT PetscSplitReductionApply(PetscSplitReduction *sr)
120: {
122:   PetscInt       i,numops = sr->numopsbegin,*reducetype = sr->reducetype;
123:   PetscScalar    *lvalues = sr->lvalues,*gvalues = sr->gvalues;
124:   PetscInt       sum_flg = 0,max_flg = 0,min_flg = 0;
125:   MPI_Comm       comm = sr->comm;
126:   PetscMPIInt    size;

129:   if (sr->numopsend > 0) {
130:     SETERRQ(PETSC_ERR_ORDER,"Cannot call this after VecxxxEnd() has been called");
131:   }

133:   PetscLogEventBarrierBegin(VEC_ReduceBarrier,0,0,0,0,comm);
134:   MPI_Comm_size(sr->comm,&size);
135:   if (size == 1) {
136:     PetscMemcpy(gvalues,lvalues,numops*sizeof(PetscScalar));
137:   } else {
138:     /* determine if all reductions are sum, max, or min */
139:     for (i=0; i<numops; i++) {
140:       if (reducetype[i] == REDUCE_MAX) {
141:         max_flg = 1;
142:       } else if (reducetype[i] == REDUCE_SUM) {
143:         sum_flg = 1;
144:       } else if (reducetype[i] == REDUCE_MIN) {
145:         min_flg = 1;
146:       } else {
147:         SETERRQ(PETSC_ERR_PLIB,"Error in PetscSplitReduction() data structure, probably memory corruption");
148:       }
149:     }
150:     if (sum_flg + max_flg + min_flg > 1) {
151:       /* 
152:          after all the entries in lvalues we store the reducetype flags to indicate
153:          to the reduction operation which values are sums and which are maxes or mins
154:       */
155:       for (i=0; i<numops; i++) {
156:         lvalues[numops+i] = reducetype[i];
157:       }
158: #if defined(PETSC_USE_COMPLEX)
159:       MPI_Allreduce(lvalues,gvalues,2*2*numops,MPIU_REAL,PetscSplitReduction_Op,comm);
160: #else
161:       MPI_Allreduce(lvalues,gvalues,2*numops,MPIU_REAL,PetscSplitReduction_Op,comm);
162: #endif
163:     } else if (max_flg) {
164: #if defined(PETSC_USE_COMPLEX)
165:       /* 
166:         in the complex case we max both the real and imaginary parts; the imaginary part
167:         is simply ignored later
168:       */
169:       MPI_Allreduce(lvalues,gvalues,2*numops,MPIU_REAL,MPI_MAX,comm);
170: #else
171:       MPI_Allreduce(lvalues,gvalues,numops,MPIU_REAL,MPI_MAX,comm);
172: #endif
173:     } else if (min_flg) {
174: #if defined(PETSC_USE_COMPLEX)
175:       /* 
176:         in the complex case we min both the real and imaginary parts; the imaginary part
177:         is simply ignored later
178:       */
179:       MPI_Allreduce(lvalues,gvalues,2*numops,MPIU_REAL,MPI_MIN,comm);
180: #else
181:       MPI_Allreduce(lvalues,gvalues,numops,MPIU_REAL,MPI_MIN,comm);
182: #endif
183:     } else {
184:       MPI_Allreduce(lvalues,gvalues,numops,MPIU_SCALAR,PetscSum_Op,comm);
185:     }
186:   }
187:   sr->state     = STATE_END;
188:   sr->numopsend = 0;
189:   PetscLogEventBarrierEnd(VEC_ReduceBarrier,0,0,0,0,comm);
190:   return(0);
191: }


196: /*
197:    PetscSplitReductionExtend - Double the amount of space (slots) allocated for a split reduction object.
198: */
199: PetscErrorCode PETSCVEC_DLLEXPORT PetscSplitReductionExtend(PetscSplitReduction *sr)
200: {
202:   PetscInt    maxops = sr->maxops,*reducetype = sr->reducetype;
203:   PetscScalar *lvalues = sr->lvalues,*gvalues = sr->gvalues;
204:   void        *invecs = sr->invecs;

207:   sr->maxops     = 2*maxops;
208:   PetscMalloc(2*2*maxops*sizeof(PetscScalar),&sr->lvalues);
209:   PetscMalloc(2*2*maxops*sizeof(PetscScalar),&sr->gvalues);
210:   PetscMalloc(2*maxops*sizeof(PetscInt),&sr->reducetype);
211:   PetscMalloc(2*maxops*sizeof(void*),&sr->invecs);
212:   PetscMemcpy(sr->lvalues,lvalues,maxops*sizeof(PetscScalar));
213:   PetscMemcpy(sr->gvalues,gvalues,maxops*sizeof(PetscScalar));
214:   PetscMemcpy(sr->reducetype,reducetype,maxops*sizeof(PetscInt));
215:   PetscMemcpy(sr->invecs,invecs,maxops*sizeof(void*));
216:   PetscFree(lvalues);
217:   PetscFree(gvalues);
218:   PetscFree(reducetype);
219:   PetscFree(invecs);
220:   return(0);
221: }

225: PetscErrorCode PETSCVEC_DLLEXPORT PetscSplitReductionDestroy(PetscSplitReduction *sr)
226: {

230:   PetscFree(sr->lvalues);
231:   PetscFree(sr->gvalues);
232:   PetscFree(sr->reducetype);
233:   PetscFree(sr->invecs);
234:   PetscFree(sr);
235:   return(0);
236: }

238: static PetscMPIInt Petsc_Reduction_keyval = MPI_KEYVAL_INVALID;

243: /*
244:    Private routine to delete internal storage when a communicator is freed.
245:   This is called by MPI, not by users.

247:   The binding for the first argument changed from MPI 1.0 to 1.1; in 1.0
248:   it was MPI_Comm *comm.  
249: */
250: int PETSCVEC_DLLEXPORT Petsc_DelReduction(MPI_Comm comm,int keyval,void* attr_val,void* extra_state)
251: {

255:   PetscLogInfo((0,"Petsc_DelReduction:Deleting reduction data in an MPI_Comm %ld\n",(long)comm));
256:   PetscSplitReductionDestroy((PetscSplitReduction *)attr_val);
257:   return(0);
258: }

261: /*
262:      PetscSplitReductionGet - Gets the split reduction object associated with an
263:         MPI communicator; creates it if it does not yet exist.

265: */
268: PetscErrorCode PETSCVEC_DLLEXPORT PetscSplitReductionGet(MPI_Comm comm,PetscSplitReduction **sr)
269: {
271:   PetscMPIInt    flag;

274:   if (Petsc_Reduction_keyval == MPI_KEYVAL_INVALID) {
275:     /* 
276:        The calling sequence of the 2nd argument to this function changed
277:        between MPI standard 1.0 and revision 1.1. Here we match the
278:        new standard; if you are using an MPI implementation that follows
279:        the older version you will get a warning message about the next line.
280:        It is only a warning message and should do no harm.
281:     */
282:     MPI_Keyval_create(MPI_NULL_COPY_FN,Petsc_DelReduction,&Petsc_Reduction_keyval,0);
283:   }
284:   MPI_Attr_get(comm,Petsc_Reduction_keyval,(void **)sr,&flag);
285:   if (!flag) {  /* doesn't exist yet so create it and put it in */
286:     PetscSplitReductionCreate(comm,sr);
287:     MPI_Attr_put(comm,Petsc_Reduction_keyval,*sr);
288:     PetscLogInfo((0,"PetscSplitReductionGet:Putting reduction data in an MPI_Comm %ld\n",(long)comm));
289:   }

291:   return(0);
292: }

294: /* ----------------------------------------------------------------------------------------------------*/

298: /*@
299:    VecDotBegin - Starts a split phase dot product computation.

301:    Input Parameters:
302: +   x - the first vector
303: .   y - the second vector
304: -   result - where the result will go (can be PETSC_NULL)

306:    Level: advanced

308:    Notes:
309:    Each call to VecDotBegin() should be paired with a call to VecDotEnd().

311: .seealso: VecDotEnd(), VecNormBegin(), VecNormEnd(), VecNorm(), VecDot(), VecMDot(), 
312:          VecTDotBegin(), VecTDotEnd()
313: @*/
314: PetscErrorCode PETSCVEC_DLLEXPORT VecDotBegin(Vec x,Vec y,PetscScalar *result)
315: {
316:   PetscErrorCode      ierr;
317:   PetscSplitReduction *sr;
318:   MPI_Comm            comm;

321:   PetscObjectGetComm((PetscObject)x,&comm);
322:   PetscSplitReductionGet(comm,&sr);
323:   if (sr->state == STATE_END) {
324:     SETERRQ(PETSC_ERR_ORDER,"Called before all VecxxxEnd() called");
325:   }
326:   if (sr->numopsbegin >= sr->maxops) {
327:     PetscSplitReductionExtend(sr);
328:   }
329:   sr->reducetype[sr->numopsbegin] = REDUCE_SUM;
330:   sr->invecs[sr->numopsbegin]     = (void*)x;
331:   if (!x->ops->dot_local) SETERRQ(PETSC_ERR_SUP,"Vector does not support local dots");
332:   PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
333:   (*x->ops->dot_local)(x,y,sr->lvalues+sr->numopsbegin++);
334:   PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);
335:   return(0);
336: }

340: /*@
341:    VecDotEnd - Ends a split phase dot product computation.

343:    Input Parameters:
344: +  x - the first vector (can be PETSC_NULL)
345: .  y - the second vector (can be PETSC_NULL)
346: -  result - where the result will go

348:    Level: advanced

350:    Notes:
351:    Each call to VecDotBegin() should be paired with a call to VecDotEnd().

353: .seealso: VecDotBegin(), VecNormBegin(), VecNormEnd(), VecNorm(), VecDot(), VecMDot(), 
354:          VecTDotBegin(),VecTDotEnd()

356: @*/
357: PetscErrorCode PETSCVEC_DLLEXPORT VecDotEnd(Vec x,Vec y,PetscScalar *result)
358: {
359:   PetscErrorCode      ierr;
360:   PetscSplitReduction *sr;
361:   MPI_Comm            comm;

364:   PetscObjectGetComm((PetscObject)x,&comm);
365:   PetscSplitReductionGet(comm,&sr);
366: 
367:   if (sr->state != STATE_END) {
368:     /* this is the first call to VecxxxEnd() so do the communication */
369:     PetscSplitReductionApply(sr);
370:   }

372:   if (sr->numopsend >= sr->numopsbegin) {
373:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Called VecxxxEnd() more times than VecxxxBegin()");
374:   }
375:   if (x && (void*) x != sr->invecs[sr->numopsend]) {
376:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Called VecxxxEnd() in a different order or with a different vector than VecxxxBegin()");
377:   }
378:   if (sr->reducetype[sr->numopsend] != REDUCE_SUM) {
379:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Called VecDotEnd() on a reduction started with VecNormBegin()");
380:   }
381:   *result = sr->gvalues[sr->numopsend++];

383:   /*
384:      We are finished getting all the results so reset to no outstanding requests
385:   */
386:   if (sr->numopsend == sr->numopsbegin) {
387:     sr->state        = STATE_BEGIN;
388:     sr->numopsend    = 0;
389:     sr->numopsbegin  = 0;
390:   }
391:   return(0);
392: }
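To make the ordering requirement concrete, here is an illustrative sketch (not part of comb.c) with two queued dot products; reversing the End() calls would trip the invecs[] consistency check in VecDotEnd() (assuming a != x) and raise PETSC_ERR_ARG_WRONGSTATE:

    #include "petscvec.h"

    /* Sketch: the VecxxxEnd() calls must match the order of the VecxxxBegin() calls. */
    PetscErrorCode TwoDots(Vec x,Vec y,Vec a,Vec b,PetscScalar *xy,PetscScalar *ab)
    {
      PetscErrorCode ierr;

      PetscFunctionBegin;
      ierr = VecDotBegin(x,y,xy);CHKERRQ(ierr);
      ierr = VecDotBegin(a,b,ab);CHKERRQ(ierr);
      ierr = VecDotEnd(x,y,xy);CHKERRQ(ierr);   /* the reduction begun first ... */
      ierr = VecDotEnd(a,b,ab);CHKERRQ(ierr);   /* ... must also be ended first  */
      PetscFunctionReturn(0);
    }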

396: /*@
397:    VecTDotBegin - Starts a split phase transpose dot product computation.

399:    Input Parameters:
400: +  x - the first vector
401: .  y - the second vector
402: -  result - where the result will go (can be PETSC_NULL)

404:    Level: advanced

406:    Notes:
407:    Each call to VecTDotBegin() should be paired with a call to VecTDotEnd().

409: .seealso: VecTDotEnd(), VecNormBegin(), VecNormEnd(), VecNorm(), VecDot(), VecMDot(), 
410:          VecDotBegin(), VecDotEnd()

412: @*/
413: PetscErrorCode PETSCVEC_DLLEXPORT VecTDotBegin(Vec x,Vec y,PetscScalar *result)
414: {
415:   PetscErrorCode      ierr;
416:   PetscSplitReduction *sr;
417:   MPI_Comm            comm;

420:   PetscObjectGetComm((PetscObject)x,&comm);
421:   PetscSplitReductionGet(comm,&sr);
422:   if (sr->state == STATE_END) {
423:     SETERRQ(PETSC_ERR_ORDER,"Called before all VecxxxEnd() called");
424:   }
425:   if (sr->numopsbegin >= sr->maxops) {
426:     PetscSplitReductionExtend(sr);
427:   }
428:   sr->reducetype[sr->numopsbegin] = REDUCE_SUM;
429:   sr->invecs[sr->numopsbegin]     = (void*)x;
430:   if (!x->ops->tdot_local) SETERRQ(PETSC_ERR_SUP,"Vector does not support local transpose dots");
431:   PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
432:   (*x->ops->tdot_local)(x,y,sr->lvalues+sr->numopsbegin++);
433:   PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);
434:   return(0);
435: }

439: /*@
440:    VecTDotEnd - Ends a split phase transpose dot product computation.

442:    Input Parameters:
443: +  x - the first vector (can be PETSC_NULL)
444: .  y - the second vector (can be PETSC_NULL)
445: -  result - where the result will go

447:    Level: advanced

449:    Notes:
450:    Each call to VecTDotBegin() should be paired with a call to VecTDotEnd().

452: .seealso: VecTDotBegin(), VecNormBegin(), VecNormEnd(), VecNorm(), VecDot(), VecMDot(), 
453:          VecDotBegin(), VecDotEnd()
454: @*/
455: PetscErrorCode PETSCVEC_DLLEXPORT VecTDotEnd(Vec x,Vec y,PetscScalar *result)
456: {

460:   /*
461:       TDotEnd() is the same as DotEnd() so reuse the code
462:   */
463:   VecDotEnd(x,y,result);
464:   return(0);
465: }

467: /* -------------------------------------------------------------------------*/

471: /*@
472:    VecNormBegin - Starts a split phase norm computation.

474:    Input Parameters:
475: +  x - the first vector
476: .  ntype - norm type, one of NORM_1, NORM_2, NORM_MAX, NORM_1_AND_2
477: -  result - where the result will go (can be PETSC_NULL)

479:    Level: advanced

481:    Notes:
482:    Each call to VecNormBegin() should be paired with a call to VecNormEnd().

484: .seealso: VecNormEnd(), VecNorm(), VecDot(), VecMDot(), VecDotBegin(), VecDotEnd()

486: @*/
487: PetscErrorCode PETSCVEC_DLLEXPORT VecNormBegin(Vec x,NormType ntype,PetscReal *result)
488: {
489:   PetscErrorCode      ierr;
490:   PetscSplitReduction *sr;
491:   PetscReal           lresult[2];
492:   MPI_Comm            comm;

495:   PetscObjectGetComm((PetscObject)x,&comm);
496:   PetscSplitReductionGet(comm,&sr);
497:   if (sr->state == STATE_END) {
498:     SETERRQ(PETSC_ERR_ORDER,"Called before all VecxxxEnd() called");
499:   }
500:   if (sr->numopsbegin >= sr->maxops || (sr->numopsbegin == sr->maxops-1 && ntype == NORM_1_AND_2)) {
501:     PetscSplitReductionExtend(sr);
502:   }
503: 
504:   sr->invecs[sr->numopsbegin]     = (void*)x;
505:   if (!x->ops->norm_local) SETERRQ(PETSC_ERR_SUP,"Vector does not support local norms");
506:   PetscLogEventBegin(VEC_ReduceArithmetic,0,0,0,0);
507:   (*x->ops->norm_local)(x,ntype,lresult);
508:   PetscLogEventEnd(VEC_ReduceArithmetic,0,0,0,0);
509:   if (ntype == NORM_2)         lresult[0]                = lresult[0]*lresult[0];
510:   if (ntype == NORM_1_AND_2)   lresult[1]                = lresult[1]*lresult[1];
511:   if (ntype == NORM_MAX) sr->reducetype[sr->numopsbegin] = REDUCE_MAX;
512:   else                   sr->reducetype[sr->numopsbegin] = REDUCE_SUM;
513:   sr->lvalues[sr->numopsbegin++] = lresult[0];
514:   if (ntype == NORM_1_AND_2) {
515:     sr->reducetype[sr->numopsbegin] = REDUCE_SUM;
516:     sr->lvalues[sr->numopsbegin++]  = lresult[1];
517:   }
518:   return(0);
519: }

523: /*@
524:    VecNormEnd - Ends a split phase norm computation.

526:    Input Parameters:
527: +  x - the first vector (can be PETSC_NULL)
528: .  ntype - norm type, one of NORM_1, NORM_2, NORM_MAX, NORM_1_AND_2
529: -  result - where the result will go

531:    Level: advanced

533:    Notes:
534:    Each call to VecNormBegin() should be paired with a call to VecNormEnd().

536: .seealso: VecNormBegin(), VecNorm(), VecDot(), VecMDot(), VecDotBegin(), VecDotEnd()

538: @*/
539: PetscErrorCode PETSCVEC_DLLEXPORT VecNormEnd(Vec x,NormType ntype,PetscReal *result)
540: {
541:   PetscErrorCode      ierr;
542:   PetscInt            type_id;
543:   PetscSplitReduction *sr;
544:   MPI_Comm            comm;

547:   VecNormComposedDataID(ntype,&type_id);

549:   PetscObjectGetComm((PetscObject)x,&comm);
550:   PetscSplitReductionGet(comm,&sr);
551: 
552:   if (sr->state != STATE_END) {
553:     /* this is the first call to VecxxxEnd() so do the communication */
554:     PetscSplitReductionApply(sr);
555:   }

557:   if (sr->numopsend >= sr->numopsbegin) {
558:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Called VecxxxEnd() more times than VecxxxBegin()");
559:   }
560:   if (x && (void*)x != sr->invecs[sr->numopsend]) {
561:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Called VecxxxEnd() in a different order or with a different vector than VecxxxBegin()");
562:   }
563:   if (sr->reducetype[sr->numopsend] != REDUCE_MAX && ntype == NORM_MAX) {
564:     SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Called VecNormEnd(,NORM_MAX,) on a reduction started with VecDotBegin() or NORM_1 or NORM_2");
565:   }
566:   result[0] = PetscRealPart(sr->gvalues[sr->numopsend++]);

568:   if (ntype == NORM_2) {
569:     result[0] = sqrt(result[0]);
570:   } else if (ntype == NORM_1_AND_2) {
571:     result[1] = PetscRealPart(sr->gvalues[sr->numopsend++]);
572:     result[1] = sqrt(result[1]);
573:   }
574:   if (ntype!=NORM_1_AND_2) {
575:     PetscObjectComposedDataSetReal((PetscObject)x,type_id,result[0]);
576:   }

578:   if (sr->numopsend == sr->numopsbegin) {
579:     sr->state        = STATE_BEGIN;
580:     sr->numopsend    = 0;
581:     sr->numopsbegin  = 0;
582:   }
583:   return(0);
584: }
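For illustration only (this helper is not part of comb.c), NORM_1_AND_2 queues two reductions, so the result argument must point to an array of two PetscReals; norms[0] receives the 1-norm and norms[1] the 2-norm:

    #include "petscvec.h"

    /* Sketch: split phase computation of both the 1-norm and the 2-norm of x. */
    PetscErrorCode BothNorms(Vec x,PetscReal norms[2])
    {
      PetscErrorCode ierr;

      PetscFunctionBegin;
      ierr = VecNormBegin(x,NORM_1_AND_2,norms);CHKERRQ(ierr);
      /* ... other VecxxxBegin() calls or independent local work ... */
      ierr = VecNormEnd(x,NORM_1_AND_2,norms);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    }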

586: /*
587:    Possibly add

589:      PetscReductionSumBegin/End()
590:      PetscReductionMaxBegin/End()
591:      PetscReductionMinBegin/End()
592:    or have something more like MPI, with a single function taking a flag for the Op? The first approach seems better.
593: */