Actual source code: mpiov.c
1: #define PETSCMAT_DLL
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5:    and to find submatrices that are shared across processors.
6: */
7:  #include "src/mat/impls/aij/mpi/mpiaij.h"
8:  #include "petscbt.h"
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
13: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
18: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19: {
21: PetscInt i;
24: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
27: }
28: return(0);
29: }
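/*
   Illustrative usage sketch (not part of the original source).  The routine above
   is normally reached through the public MatIncreaseOverlap() interface, e.g. when
   an additive Schwarz preconditioner grows its local subdomains by ov levels.
   The matrix A and the sizes nlocal,rstart below are hypothetical, and the calling
   sequence assumes the PETSc version this file belongs to.

      IS is;
      ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is);
      MatIncreaseOverlap(A,1,&is,2);
      ... use the enlarged index set, e.g. to extract a local submatrix ...
      ISDestroy(is);
*/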
31: /*
32: Sample message format:
33: If a processor A wants processor B to process some elements corresponding
34: to index sets is[1],is[5]
35: mesg [0] = 2 (no of index sets in the mesg)
36: -----------
37: mesg [1] = 1 => is[1]
38: mesg [2] = sizeof(is[1]);
39: -----------
40: mesg [3] = 5 => is[5]
41: mesg [4] = sizeof(is[5]);
42: -----------
43: mesg [5]
44:     mesg [n]   data for is[1]
45:     -----------
46:     mesg [n+1]
47:     mesg [m]   data for is[5]
48: -----------
49:
50: Notes:
51: nrqs - no of requests sent (or to be sent out)
52:   nrqr - no of requests received (which have to be, or which have been, processed)
53: */
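/*
   Concrete instance of the format above (hypothetical sizes): if is[1] contains
   rows {4,7,9} and is[5] contains rows {2,8} destined for processor B, the
   message sent by A is

      { 2,  1,3,  5,2,  4,7,9,  2,8 }
*/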
56: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
57: {
58: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
59: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
60: PetscInt **idx,*n,*rtable,**data,len,*idx_i;
62: PetscMPIInt size,rank,tag1,tag2;
63: PetscInt m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
64: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
65: PetscBT *table;
66: MPI_Comm comm;
67: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68: MPI_Status *s_status,*recv_status;
71: comm = C->comm;
72: size = c->size;
73: rank = c->rank;
74: m = C->M;
76: PetscObjectGetNewTag((PetscObject)C,&tag1);
77: PetscObjectGetNewTag((PetscObject)C,&tag2);
78:
79: len = (imax+1)*sizeof(PetscInt*)+ (imax + m)*sizeof(PetscInt);
80: PetscMalloc(len,&idx);
81: n = (PetscInt*)(idx + imax);
82: rtable = n + imax;
83:
84: for (i=0; i<imax; i++) {
85: ISGetIndices(is[i],&idx[i]);
86: ISGetLocalSize(is[i],&n[i]);
87: }
88:
89:   /* Create hash table for the mapping: row -> proc */
90: for (i=0,j=0; i<size; i++) {
91: len = c->rowners[i+1];
92: for (; j<len; j++) {
93: rtable[j] = i;
94: }
95: }
97:   /* evaluate communication - mesg to who, length of mesg, and buffer space
98: required. Based on this, buffers are allocated, and data copied into them*/
99: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1);/* mesg size */
100: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
101: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
102: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
103: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialise work vector*/
104: for (i=0; i<imax; i++) {
105: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
106: idx_i = idx[i];
107: len = n[i];
108: for (j=0; j<len; j++) {
109: row = idx_i[j];
110: if (row < 0) {
111: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
112: }
113: proc = rtable[row];
114: w4[proc]++;
115: }
116: for (j=0; j<size; j++){
117: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
118: }
119: }
121: nrqs = 0; /* no of outgoing messages */
122:   msz = 0; /* total mesg length (for all procs) */
123:   w1[rank] = 0; /* no mesg sent to itself */
124: w3[rank] = 0;
125: for (i=0; i<size; i++) {
126: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
127: }
128:   /* pa - list of processors to communicate with */
129: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
130: for (i=0,j=0; i<size; i++) {
131: if (w1[i]) {pa[j] = i; j++;}
132: }
134: /* Each message would have a header = 1 + 2*(no of IS) + data */
135: for (i=0; i<nrqs; i++) {
136: j = pa[i];
137: w1[j] += w2[j] + 2*w3[j];
138: msz += w1[j];
139: }
141:   /* Determine the number of messages to expect, their lengths, and their from-ids */
142: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
143: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
145: /* Now post the Irecvs corresponding to these messages */
146: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
148: /* Allocate Memory for outgoing messages */
149: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
150: PetscMalloc(len,&outdat);
151: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
152: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
153: tmp = (PetscInt*)(outdat + 2*size);
154: ctr = tmp + msz;
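  /* carve the single allocation tmp into per-destination slices: outdat[j] points
     to a block of length w1[j] for each processor j that will receive a message */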
156: {
157: PetscInt *iptr = tmp,ict = 0;
158: for (i=0; i<nrqs; i++) {
159: j = pa[i];
160: iptr += ict;
161: outdat[j] = iptr;
162: ict = w1[j];
163: }
164: }
166: /* Form the outgoing messages */
167: /*plug in the headers*/
168: for (i=0; i<nrqs; i++) {
169: j = pa[i];
170: outdat[j][0] = 0;
171: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
172: ptr[j] = outdat[j] + 2*w3[j] + 1;
173: }
174:
175: /* Memory for doing local proc's work*/
176: {
177: PetscInt *d_p;
178: char *t_p;
180: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
181: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
182: PetscMalloc(len,&table);
183: PetscMemzero(table,len);
184: data = (PetscInt **)(table + imax);
185: isz = (PetscInt *)(data + imax);
186: d_p = (PetscInt *)(isz + imax);
187: t_p = (char *)(d_p + m*imax);
188: for (i=0; i<imax; i++) {
189: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
190: data[i] = d_p + (m)*i;
191: }
192: }
194: /* Parse the IS and update local tables and the outgoing buf with the data*/
195: {
196: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
197: PetscBT table_i;
199: for (i=0; i<imax; i++) {
200: PetscMemzero(ctr,size*sizeof(PetscInt));
201: n_i = n[i];
202: table_i = table[i];
203: idx_i = idx[i];
204: data_i = data[i];
205: isz_i = isz[i];
206: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
207: row = idx_i[j];
208: proc = rtable[row];
209: if (proc != rank) { /* copy to the outgoing buffer */
210: ctr[proc]++;
211: *ptr[proc] = row;
212: ptr[proc]++;
213: } else { /* Update the local table */
214: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
215: }
216: }
217: /* Update the headers for the current IS */
218: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
219: if ((ctr_j = ctr[j])) {
220: outdat_j = outdat[j];
221: k = ++outdat_j[0];
222: outdat_j[2*k] = ctr_j;
223: outdat_j[2*k-1] = i;
224: }
225: }
226: isz[i] = isz_i;
227: }
228: }
229:
232: /* Now post the sends */
233: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
234: for (i=0; i<nrqs; ++i) {
235: j = pa[i];
236: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
237: }
238:
239: /* No longer need the original indices*/
240: for (i=0; i<imax; ++i) {
241: ISRestoreIndices(is[i],idx+i);
242: }
243: PetscFree(idx);
245: for (i=0; i<imax; ++i) {
246: ISDestroy(is[i]);
247: }
248:
249: /* Do Local work*/
250: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
252: /* Receive messages*/
253: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
254: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
255:
256: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
257: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
259: /* Phase 1 sends are complete - deallocate buffers */
260: PetscFree(outdat);
261: PetscFree(w1);
263: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
264: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
265: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
266: PetscFree(rbuf);
268:
269: /* Send the data back*/
270: /* Do a global reduction to know the buffer space req for incoming messages*/
271: {
272: PetscMPIInt *rw1;
273:
274: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
275: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
277: for (i=0; i<nrqr; ++i) {
278: proc = recv_status[i].MPI_SOURCE;
279: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
280: rw1[proc] = isz1[i];
281: }
282: PetscFree(onodes1);
283: PetscFree(olengths1);
285:     /* Determine the number of messages to expect, their lengths, and their from-ids */
286: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
287: PetscFree(rw1);
288: }
289: /* Now post the Irecvs corresponding to these messages */
290: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
292: /* Now post the sends */
293: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
294: for (i=0; i<nrqr; ++i) {
295: j = recv_status[i].MPI_SOURCE;
296: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
297: }
299: /* receive work done on other processors*/
300: {
301: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
302: PetscMPIInt idex;
303: PetscBT table_i;
304: MPI_Status *status2;
305:
306: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
307: for (i=0; i<nrqs; ++i) {
308: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
309: /* Process the message*/
310: rbuf2_i = rbuf2[idex];
311: ct1 = 2*rbuf2_i[0]+1;
312: jmax = rbuf2[idex][0];
313: for (j=1; j<=jmax; j++) {
314: max = rbuf2_i[2*j];
315: is_no = rbuf2_i[2*j-1];
316: isz_i = isz[is_no];
317: data_i = data[is_no];
318: table_i = table[is_no];
319: for (k=0; k<max; k++,ct1++) {
320: row = rbuf2_i[ct1];
321: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
322: }
323: isz[is_no] = isz_i;
324: }
325: }
327: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
328: PetscFree(status2);
329: }
330:
331: for (i=0; i<imax; ++i) {
332: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
333: }
334:
335: PetscFree(onodes2);
336: PetscFree(olengths2);
338: PetscFree(pa);
339: PetscFree(rbuf2);
340: PetscFree(s_waits1);
341: PetscFree(r_waits1);
342: PetscFree(s_waits2);
343: PetscFree(r_waits2);
344: PetscFree(table);
345: PetscFree(s_status);
346: PetscFree(recv_status);
347: PetscFree(xdata[0]);
348: PetscFree(xdata);
349: PetscFree(isz1);
350: return(0);
351: }
355: /*
356:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
357: the work on the local processor.
359: Inputs:
360: C - MAT_MPIAIJ;
361: imax - total no of index sets processed at a time;
362: table - an array of char - size = m bits.
363:
364: Output:
365:    isz    - array containing the count of the solution elements corresponding
366: to each index set;
367: data - pointer to the solutions
368: */
369: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
370: {
371: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
372: Mat A = c->A,B = c->B;
373: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
374: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
375: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
376: PetscBT table_i;
379: rstart = c->rstart;
380: cstart = c->cstart;
381: ai = a->i;
382: aj = a->j;
383: bi = b->i;
384: bj = b->j;
385: garray = c->garray;
387:
388: for (i=0; i<imax; i++) {
389: data_i = data[i];
390: table_i = table[i];
391: isz_i = isz[i];
392: for (j=0,max=isz[i]; j<max; j++) {
393: row = data_i[j] - rstart;
394: start = ai[row];
395: end = ai[row+1];
396: for (k=start; k<end; k++) { /* Amat */
397: val = aj[k] + cstart;
398: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399: }
400: start = bi[row];
401: end = bi[row+1];
402: for (k=start; k<end; k++) { /* Bmat */
403: val = garray[bj[k]];
404: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
405: }
406: }
407: isz[i] = isz_i;
408: }
409: return(0);
410: }
414: /*
415:    MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
416: and return the output
418: Input:
419: C - the matrix
420: nrqr - no of messages being processed.
421:    rbuf - an array of pointers to the received requests
422:
423: Output:
424: xdata - array of messages to be sent back
425: isz1 - size of each message
427:    For better efficiency perhaps we should malloc each xdata[i] separately;
428:    then if a realloc is required we need only copy the data for that one row
429:    rather than all previous rows, as happens now where a single large chunk of
430:    memory is used.
432: */
433: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
434: {
435: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
436: Mat A = c->A,B = c->B;
437: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
439: PetscMPIInt rank;
440: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
441: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
442: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
443: PetscInt *rbuf_i,kmax,rbuf_0;
444: PetscBT xtable;
447: rank = c->rank;
448: m = C->M;
449: rstart = c->rstart;
450: cstart = c->cstart;
451: ai = a->i;
452: aj = a->j;
453: bi = b->i;
454: bj = b->j;
455: garray = c->garray;
456:
457:
458: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
459: rbuf_i = rbuf[i];
460: rbuf_0 = rbuf_i[0];
461: ct += rbuf_0;
462: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
463: }
464:
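  /* rough initial guess for the total size of all replies, based on the request
     sizes and the average local row length; whenever it turns out to be too small,
     the buffer is reallocated at 1.5 times its size and the existing data copied
     over (no_malloc counts these regrowths) */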
465: if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
466: else max1 = 1;
467: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
468: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
469: ++no_malloc;
470: PetscBTCreate(m,xtable);
471: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
472:
473: ct3 = 0;
474:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
475: rbuf_i = rbuf[i];
476: rbuf_0 = rbuf_i[0];
477: ct1 = 2*rbuf_0+1;
478: ct2 = ct1;
479: ct3 += ct1;
480: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
481: PetscBTMemzero(m,xtable);
482: oct2 = ct2;
483: kmax = rbuf_i[2*j];
484: for (k=0; k<kmax; k++,ct1++) {
485: row = rbuf_i[ct1];
486: if (!PetscBTLookupSet(xtable,row)) {
487: if (!(ct3 < mem_estimate)) {
488: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
489: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
490: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
491: PetscFree(xdata[0]);
492: xdata[0] = tmp;
493: mem_estimate = new_estimate; ++no_malloc;
494: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
495: }
496: xdata[i][ct2++] = row;
497: ct3++;
498: }
499: }
500: for (k=oct2,max2=ct2; k<max2; k++) {
501: row = xdata[i][k] - rstart;
502: start = ai[row];
503: end = ai[row+1];
504: for (l=start; l<end; l++) {
505: val = aj[l] + cstart;
506: if (!PetscBTLookupSet(xtable,val)) {
507: if (!(ct3 < mem_estimate)) {
508: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
509: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
510: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
511: PetscFree(xdata[0]);
512: xdata[0] = tmp;
513: mem_estimate = new_estimate; ++no_malloc;
514: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
515: }
516: xdata[i][ct2++] = val;
517: ct3++;
518: }
519: }
520: start = bi[row];
521: end = bi[row+1];
522: for (l=start; l<end; l++) {
523: val = garray[bj[l]];
524: if (!PetscBTLookupSet(xtable,val)) {
525: if (!(ct3 < mem_estimate)) {
526: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
527: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
528: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
529: PetscFree(xdata[0]);
530: xdata[0] = tmp;
531: mem_estimate = new_estimate; ++no_malloc;
532: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
533: }
534: xdata[i][ct2++] = val;
535: ct3++;
536: }
537: }
538: }
539: /* Update the header*/
540: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
541: xdata[i][2*j-1] = rbuf_i[2*j-1];
542: }
543: xdata[i][0] = rbuf_0;
544: xdata[i+1] = xdata[i] + ct2;
545: isz1[i] = ct2; /* size of each message */
546: }
547: PetscBTDestroy(xtable);
548: PetscLogInfo((0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate, ct3,no_malloc));
549: return(0);
550: }
551: /* -------------------------------------------------------------------------*/
552: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
553: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
554: /*
555: Every processor gets the entire matrix
556: */
559: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
560: {
561: Mat B;
562: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
563: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
565: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
566: PetscInt sendcount,i,*rstarts = a->rowners,n,cnt,j;
567: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
568: PetscScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
571: MPI_Comm_size(A->comm,&size);
572: MPI_Comm_rank(A->comm,&rank);
574: if (scall == MAT_INITIAL_MATRIX) {
575: /* ----------------------------------------------------------------
576: Tell every processor the number of nonzeros per row
577: */
578: PetscMalloc(A->M*sizeof(PetscInt),&lens);
579: for (i=a->rstart; i<a->rend; i++) {
580: lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
581: }
582: sendcount = a->rend - a->rstart;
583: PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
584: displs = recvcounts + size;
585: for (i=0; i<size; i++) {
586: recvcounts[i] = a->rowners[i+1] - a->rowners[i];
587: displs[i] = a->rowners[i];
588: }
589: MPI_Allgatherv(lens+a->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,A->comm);
591: /* ---------------------------------------------------------------
592: Create the sequential matrix of the same type as the local block diagonal
593: */
594: MatCreate(PETSC_COMM_SELF,&B);
595: MatSetSizes(B,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE);
596: MatSetType(B,a->A->type_name);
597: MatSeqAIJSetPreallocation(B,0,lens);
598: PetscMalloc(sizeof(Mat),Bin);
599: **Bin = B;
600: b = (Mat_SeqAIJ *)B->data;
602: /*--------------------------------------------------------------------
603: Copy my part of matrix column indices over
604: */
605: sendcount = ad->nz + bd->nz;
606: jsendbuf = b->j + b->i[rstarts[rank]];
607: a_jsendbuf = ad->j;
608: b_jsendbuf = bd->j;
609: n = a->rend - a->rstart;
610: cnt = 0;
611: for (i=0; i<n; i++) {
613: /* put in lower diagonal portion */
614: m = bd->i[i+1] - bd->i[i];
615: while (m > 0) {
616: /* is it above diagonal (in bd (compressed) numbering) */
617: if (garray[*b_jsendbuf] > a->rstart + i) break;
618: jsendbuf[cnt++] = garray[*b_jsendbuf++];
619: m--;
620: }
622: /* put in diagonal portion */
623: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
624: jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
625: }
627: /* put in upper diagonal portion */
628: while (m-- > 0) {
629: jsendbuf[cnt++] = garray[*b_jsendbuf++];
630: }
631: }
632: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
634: /*--------------------------------------------------------------------
635: Gather all column indices to all processors
636: */
637: for (i=0; i<size; i++) {
638: recvcounts[i] = 0;
639: for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
640: recvcounts[i] += lens[j];
641: }
642: }
643: displs[0] = 0;
644: for (i=1; i<size; i++) {
645: displs[i] = displs[i-1] + recvcounts[i-1];
646: }
647: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,A->comm);
649: /*--------------------------------------------------------------------
650:        Assemble the matrix into usable form (note numerical values not yet set)
651: */
652: /* set the b->ilen (length of each row) values */
653: PetscMemcpy(b->ilen,lens,A->M*sizeof(PetscInt));
654: /* set the b->i indices */
655: b->i[0] = 0;
656: for (i=1; i<=A->M; i++) {
657: b->i[i] = b->i[i-1] + lens[i-1];
658: }
659: PetscFree(lens);
660: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
661: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
663: } else {
664: B = **Bin;
665: b = (Mat_SeqAIJ *)B->data;
666: }
668: /*--------------------------------------------------------------------
669: Copy my part of matrix numerical values into the values location
670: */
671: sendcount = ad->nz + bd->nz;
672: sendbuf = b->a + b->i[rstarts[rank]];
673: a_sendbuf = ad->a;
674: b_sendbuf = bd->a;
675: b_sendj = bd->j;
676: n = a->rend - a->rstart;
677: cnt = 0;
678: for (i=0; i<n; i++) {
680: /* put in lower diagonal portion */
681: m = bd->i[i+1] - bd->i[i];
682: while (m > 0) {
683: /* is it above diagonal (in bd (compressed) numbering) */
684: if (garray[*b_sendj] > a->rstart + i) break;
685: sendbuf[cnt++] = *b_sendbuf++;
686: m--;
687: b_sendj++;
688: }
690: /* put in diagonal portion */
691: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
692: sendbuf[cnt++] = *a_sendbuf++;
693: }
695: /* put in upper diagonal portion */
696: while (m-- > 0) {
697: sendbuf[cnt++] = *b_sendbuf++;
698: b_sendj++;
699: }
700: }
701: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
702:
703: /* -----------------------------------------------------------------
704: Gather all numerical values to all processors
705: */
706: if (!recvcounts) {
707:     PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
708: displs = recvcounts + size;
709: }
710: for (i=0; i<size; i++) {
711: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
712: }
713: displs[0] = 0;
714: for (i=1; i<size; i++) {
715: displs[i] = displs[i-1] + recvcounts[i-1];
716: }
717: recvbuf = b->a;
718: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);
719: PetscFree(recvcounts);
720: if (A->symmetric){
721: MatSetOption(B,MAT_SYMMETRIC);
722: } else if (A->hermitian) {
723: MatSetOption(B,MAT_HERMITIAN);
724: } else if (A->structurally_symmetric) {
725: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC);
726: }
728: return(0);
729: }
733: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
734: {
736: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
737: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
740: /*
741:        Check for the special case where each processor gets the entire matrix
742: */
743: if (ismax == 1 && C->M == C->N) {
744: ISIdentity(*isrow,&rowflag);
745: ISIdentity(*iscol,&colflag);
746: ISGetLocalSize(*isrow,&nrow);
747: ISGetLocalSize(*iscol,&ncol);
748: if (rowflag && colflag && nrow == C->M && ncol == C->N) {
749: wantallmatrix = PETSC_TRUE;
750: PetscOptionsGetTruth(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
751: }
752: }
753: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);
754: if (twantallmatrix) {
755: MatGetSubMatrix_MPIAIJ_All(C,scall,submat);
756: return(0);
757: }
759: /* Allocate memory to hold all the submatrices */
760: if (scall != MAT_REUSE_MATRIX) {
761: PetscMalloc((ismax+1)*sizeof(Mat),submat);
762: }
763: /* Determine the number of stages through which submatrices are done */
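  /* interpretation (not in the original source): process at most nmax index sets per
     stage so that the work arrays of C->N integers allocated per index set stay
     around 20 MB in total */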
764: nmax = 20*1000000 / (C->N * sizeof(PetscInt));
765: if (!nmax) nmax = 1;
766: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
768: /* Make sure every processor loops through the nstages */
769: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,C->comm);
771: for (i=0,pos=0; i<nstages; i++) {
772: if (pos+nmax <= ismax) max_no = nmax;
773: else if (pos == ismax) max_no = 0;
774: else max_no = ismax-pos;
775: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
776: pos += max_no;
777: }
778: return(0);
779: }
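/*
   Illustrative usage sketch (not part of the original source), assuming the public
   MatGetSubMatrices() interface of this PETSc version; C, nloc and the rows[] array
   are hypothetical.  Note that the row index sets must be sorted (checked in
   MatGetSubMatrices_MPIAIJ_Local below).

      IS   rowis,colis;
      Mat  *submat;
      ISCreateGeneral(PETSC_COMM_SELF,nloc,rows,&rowis);
      ISCreateGeneral(PETSC_COMM_SELF,nloc,rows,&colis);
      ISSort(rowis); ISSort(colis);
      MatGetSubMatrices(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submat);
      ... use submat[0] ...
      MatDestroyMatrices(1,&submat);
*/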
780: /* -------------------------------------------------------------------------*/
783: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
784: {
785: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
786: Mat A = c->A;
787: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
788: PetscInt **irow,**icol,*nrow,*ncol,start;
790: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
791: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
792: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
793: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
794: PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
795: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
796: PetscInt *rmap_i;
797: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
798: MPI_Request *r_waits4,*s_waits3,*s_waits4;
799: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
800: MPI_Status *r_status3,*r_status4,*s_status4;
801: MPI_Comm comm;
802: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
803: PetscTruth sorted;
804: PetscMPIInt *onodes1,*olengths1;
805: PetscMPIInt idex,idex2,end;
808: comm = C->comm;
809: tag0 = C->tag;
810: size = c->size;
811: rank = c->rank;
812:
813: /* Get some new tags to keep the communication clean */
814: PetscObjectGetNewTag((PetscObject)C,&tag1);
815: PetscObjectGetNewTag((PetscObject)C,&tag2);
816: PetscObjectGetNewTag((PetscObject)C,&tag3);
818:   /* Check that the row indices are sorted (the matching check on the col indices is disabled below) */
819: for (i=0; i<ismax; i++) {
820: ISSorted(isrow[i],&sorted);
821: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
822: ISSorted(iscol[i],&sorted);
823: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
824: }
826: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
827: PetscMalloc(len,&irow);
828: icol = irow + ismax;
829: nrow = (PetscInt*)(icol + ismax);
830: ncol = nrow + ismax;
832: for (i=0; i<ismax; i++) {
833: ISGetIndices(isrow[i],&irow[i]);
834: ISGetIndices(iscol[i],&icol[i]);
835: ISGetLocalSize(isrow[i],&nrow[i]);
836: ISGetLocalSize(iscol[i],&ncol[i]);
837: }
839: /* evaluate communication - mesg to who, length of mesg, and buffer space
840: required. Based on this, buffers are allocated, and data copied into them*/
841: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1); /* mesg size */
842: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
843: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
844: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
845: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialize work vector*/
846: for (i=0; i<ismax; i++) {
847: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
848: jmax = nrow[i];
849: irow_i = irow[i];
850: for (l=0,j=0; j<jmax; j++) {
851: row = irow_i[j];
852: while (row >= c->rowners[l+1]) l++;
853: proc = l;
854: w4[proc]++;
855: }
856: for (j=0; j<size; j++) {
857: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
858: }
859: }
860:
861: nrqs = 0; /* no of outgoing messages */
862: msz = 0; /* total mesg length (for all procs) */
863: w1[rank] = 0; /* no mesg sent to self */
864: w3[rank] = 0;
865: for (i=0; i<size; i++) {
866: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
867: }
868: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
869: for (i=0,j=0; i<size; i++) {
870: if (w1[i]) { pa[j] = i; j++; }
871: }
873: /* Each message would have a header = 1 + 2*(no of IS) + data */
874: for (i=0; i<nrqs; i++) {
875: j = pa[i];
876: w1[j] += w2[j] + 2* w3[j];
877: msz += w1[j];
878: }
880:   /* Determine the number of messages to expect, their lengths, and their from-ids */
881: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
882: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
884: /* Now post the Irecvs corresponding to these messages */
885: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
886:
887: PetscFree(onodes1);
888: PetscFree(olengths1);
889:
890: /* Allocate Memory for outgoing messages */
891: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
892: PetscMalloc(len,&sbuf1);
893: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
894: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
895: /* allocate memory for outgoing data + buf to receive the first reply */
896: tmp = (PetscInt*)(ptr + size);
897: ctr = tmp + 2*msz;
899: {
900: PetscInt *iptr = tmp,ict = 0;
901: for (i=0; i<nrqs; i++) {
902: j = pa[i];
903: iptr += ict;
904: sbuf1[j] = iptr;
905: ict = w1[j];
906: }
907: }
909: /* Form the outgoing messages */
910: /* Initialize the header space */
911: for (i=0; i<nrqs; i++) {
912: j = pa[i];
913: sbuf1[j][0] = 0;
914: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
915: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
916: }
917:
918: /* Parse the isrow and copy data into outbuf */
919: for (i=0; i<ismax; i++) {
920: PetscMemzero(ctr,size*sizeof(PetscInt));
921: irow_i = irow[i];
922: jmax = nrow[i];
923: for (l=0,j=0; j<jmax; j++) { /* parse the indices of each IS */
924: row = irow_i[j];
925: while (row >= c->rowners[l+1]) l++;
926: proc = l;
927: if (proc != rank) { /* copy to the outgoing buf*/
928: ctr[proc]++;
929: *ptr[proc] = row;
930: ptr[proc]++;
931: }
932: }
933: /* Update the headers for the current IS */
934: for (j=0; j<size; j++) { /* Can Optimise this loop too */
935: if ((ctr_j = ctr[j])) {
936: sbuf1_j = sbuf1[j];
937: k = ++sbuf1_j[0];
938: sbuf1_j[2*k] = ctr_j;
939: sbuf1_j[2*k-1] = i;
940: }
941: }
942: }
944: /* Now post the sends */
945: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
946: for (i=0; i<nrqs; ++i) {
947: j = pa[i];
948: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
949: }
951: /* Post Receives to capture the buffer size */
952: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
953: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
954: rbuf2[0] = tmp + msz;
955: for (i=1; i<nrqs; ++i) {
956: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
957: }
958: for (i=0; i<nrqs; ++i) {
959: j = pa[i];
960: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
961: }
963: /* Send to other procs the buf size they should allocate */
964:
966: /* Receive messages*/
967: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
968: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
969: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
970: PetscMalloc(len,&sbuf2);
971: req_size = (PetscInt*)(sbuf2 + nrqr);
972: req_source = req_size + nrqr;
973:
974: {
975: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
976: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
977: PetscInt *sbuf2_i;
979: for (i=0; i<nrqr; ++i) {
980: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
981: req_size[idex] = 0;
982: rbuf1_i = rbuf1[idex];
983: start = 2*rbuf1_i[0] + 1;
984: MPI_Get_count(r_status1+i,MPIU_INT,&end);
985: PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
986: sbuf2_i = sbuf2[idex];
987: for (j=start; j<end; j++) {
988: id = rbuf1_i[j] - rstart;
989: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
990: sbuf2_i[j] = ncols;
991: req_size[idex] += ncols;
992: }
993: req_source[idex] = r_status1[i].MPI_SOURCE;
994: /* form the header */
995: sbuf2_i[0] = req_size[idex];
996: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
997: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
998: }
999: }
1000: PetscFree(r_status1);
1001: PetscFree(r_waits1);
1003: /* recv buffer sizes */
1004: /* Receive messages*/
1005:
1006: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1007: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1008: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1009: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1010: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1012: for (i=0; i<nrqs; ++i) {
1013: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1014: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1015: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1016: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1017: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1018: }
1019: PetscFree(r_status2);
1020: PetscFree(r_waits2);
1021:
1022: /* Wait on sends1 and sends2 */
1023: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1024: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1026: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1027: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1028: PetscFree(s_status1);
1029: PetscFree(s_status2);
1030: PetscFree(s_waits1);
1031: PetscFree(s_waits2);
1033: /* Now allocate buffers for a->j, and send them off */
1034: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1035: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1036: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1037: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1038:
1039: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1040: {
1041: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1042: PetscInt *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1043: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1045: for (i=0; i<nrqr; i++) {
1046: rbuf1_i = rbuf1[i];
1047: sbuf_aj_i = sbuf_aj[i];
1048: ct1 = 2*rbuf1_i[0] + 1;
1049: ct2 = 0;
1050: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1051: kmax = rbuf1[i][2*j];
1052: for (k=0; k<kmax; k++,ct1++) {
1053: row = rbuf1_i[ct1] - rstart;
1054: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1055: ncols = nzA + nzB;
1056: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1058: /* load the column indices for this row into cols*/
1059: cols = sbuf_aj_i + ct2;
1060:
1061: for (l=0; l<nzB; l++) {
1062: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
1063: else break;
1064: }
1065: imark = l;
1066: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
1067: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1069: ct2 += ncols;
1070: }
1071: }
1072: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1073: }
1074: }
1075: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1076: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1078: /* Allocate buffers for a->a, and send them off */
1079: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1080: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1081: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1082: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1083:
1084: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1085: {
1086: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1087: PetscInt cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1088: PetscInt *b_j = b->j;
1089: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1090:
1091: for (i=0; i<nrqr; i++) {
1092: rbuf1_i = rbuf1[i];
1093: sbuf_aa_i = sbuf_aa[i];
1094: ct1 = 2*rbuf1_i[0]+1;
1095: ct2 = 0;
1096: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1097: kmax = rbuf1_i[2*j];
1098: for (k=0; k<kmax; k++,ct1++) {
1099: row = rbuf1_i[ct1] - rstart;
1100: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1101: ncols = nzA + nzB;
1102: cworkB = b_j + b_i[row];
1103: vworkA = a_a + a_i[row];
1104: vworkB = b_a + b_i[row];
1106: /* load the column values for this row into vals*/
1107: vals = sbuf_aa_i+ct2;
1108:
1109: for (l=0; l<nzB; l++) {
1110: if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l];
1111: else break;
1112: }
1113: imark = l;
1114: for (l=0; l<nzA; l++) vals[imark+l] = vworkA[l];
1115: for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1116:
1117: ct2 += ncols;
1118: }
1119: }
1120: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1121: }
1122: }
1123: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1124: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1125: PetscFree(rbuf1);
1127: /* Form the matrix */
1128: /* create col map */
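  /* cmap[i][col] holds 1 + (position of global column col within iscol[i]), and 0 if
     the column is not selected; the +1 lets 0 double as "not present" */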
1129: {
1130: PetscInt *icol_i;
1131:
1132: len = (1+ismax)*sizeof(PetscInt*)+ (1+ismax*C->N)*sizeof(PetscInt);
1133: PetscMalloc(len,&cmap);
1134: cmap[0] = (PetscInt*)(cmap + ismax);
1135: PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(PetscInt));
1136: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1137: for (i=0; i<ismax; i++) {
1138: jmax = ncol[i];
1139: icol_i = icol[i];
1140: cmap_i = cmap[i];
1141: for (j=0; j<jmax; j++) {
1142: cmap_i[icol_i[j]] = j+1;
1143: }
1144: }
1145: }
1147: /* Create lens which is required for MatCreate... */
1148: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1149: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1150: PetscMalloc(len,&lens);
1151: lens[0] = (PetscInt*)(lens + ismax);
1152: PetscMemzero(lens[0],j*sizeof(PetscInt));
1153: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1154:
1155: /* Update lens from local data */
1156: for (i=0; i<ismax; i++) {
1157: jmax = nrow[i];
1158: cmap_i = cmap[i];
1159: irow_i = irow[i];
1160: lens_i = lens[i];
1161: for (l=0,j=0; j<jmax; j++) {
1162: row = irow_i[j];
1163: while (row >= c->rowners[l+1]) l++;
1164: proc = l;
1165: if (proc == rank) {
1166: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1167: for (k=0; k<ncols; k++) {
1168: if (cmap_i[cols[k]]) { lens_i[j]++;}
1169: }
1170: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1171: }
1172: }
1173: }
1174:
1175: /* Create row map*/
1176: len = (1+ismax)*sizeof(PetscInt*)+ ismax*C->M*sizeof(PetscInt);
1177: PetscMalloc(len,&rmap);
1178: rmap[0] = (PetscInt*)(rmap + ismax);
1179: PetscMemzero(rmap[0],ismax*C->M*sizeof(PetscInt));
1180: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1181: for (i=0; i<ismax; i++) {
1182: rmap_i = rmap[i];
1183: irow_i = irow[i];
1184: jmax = nrow[i];
1185: for (j=0; j<jmax; j++) {
1186: rmap_i[irow_i[j]] = j;
1187: }
1188: }
1189:
1190: /* Update lens from offproc data */
1191: {
1192: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1194: for (tmp2=0; tmp2<nrqs; tmp2++) {
1195: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1196: idex = pa[idex2];
1197: sbuf1_i = sbuf1[idex];
1198: jmax = sbuf1_i[0];
1199: ct1 = 2*jmax+1;
1200: ct2 = 0;
1201: rbuf2_i = rbuf2[idex2];
1202: rbuf3_i = rbuf3[idex2];
1203: for (j=1; j<=jmax; j++) {
1204: is_no = sbuf1_i[2*j-1];
1205: max1 = sbuf1_i[2*j];
1206: lens_i = lens[is_no];
1207: cmap_i = cmap[is_no];
1208: rmap_i = rmap[is_no];
1209: for (k=0; k<max1; k++,ct1++) {
1210:           row = rmap_i[sbuf1_i[ct1]]; /* the row in the new submatrix this maps to */
1211: max2 = rbuf2_i[ct1];
1212: for (l=0; l<max2; l++,ct2++) {
1213: if (cmap_i[rbuf3_i[ct2]]) {
1214: lens_i[row]++;
1215: }
1216: }
1217: }
1218: }
1219: }
1220: }
1221: PetscFree(r_status3);
1222: PetscFree(r_waits3);
1223: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1224: PetscFree(s_status3);
1225: PetscFree(s_waits3);
1227: /* Create the submatrices */
1228: if (scall == MAT_REUSE_MATRIX) {
1229: PetscTruth flag;
1231: /*
1232:         Assumes new rows are the same length as the old rows, hence bug!
1233: */
1234: for (i=0; i<ismax; i++) {
1235: mat = (Mat_SeqAIJ *)(submats[i]->data);
1236: if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1237: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1238: }
1239: PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(PetscInt),&flag);
1240: if (!flag) {
1241: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1242: }
1243:       /* Initialize the matrix as if empty */
1244: PetscMemzero(mat->ilen,submats[i]->m*sizeof(PetscInt));
1245: submats[i]->factor = C->factor;
1246: }
1247: } else {
1248: for (i=0; i<ismax; i++) {
1249: MatCreate(PETSC_COMM_SELF,submats+i);
1250: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1251: MatSetType(submats[i],A->type_name);
1252: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1253: }
1254: }
1256: /* Assemble the matrices */
1257: /* First assemble the local rows */
1258: {
1259: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1260: PetscScalar *imat_a;
1261:
1262: for (i=0; i<ismax; i++) {
1263: mat = (Mat_SeqAIJ*)submats[i]->data;
1264: imat_ilen = mat->ilen;
1265: imat_j = mat->j;
1266: imat_i = mat->i;
1267: imat_a = mat->a;
1268: cmap_i = cmap[i];
1269: rmap_i = rmap[i];
1270: irow_i = irow[i];
1271: jmax = nrow[i];
1272: for (l=0,j=0; j<jmax; j++) {
1273: row = irow_i[j];
1274: while (row >= c->rowners[l+1]) l++;
1275: proc = l;
1276: if (proc == rank) {
1277: old_row = row;
1278: row = rmap_i[row];
1279: ilen_row = imat_ilen[row];
1280: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1281: mat_i = imat_i[row] ;
1282: mat_a = imat_a + mat_i;
1283: mat_j = imat_j + mat_i;
1284: for (k=0; k<ncols; k++) {
1285: if ((tcol = cmap_i[cols[k]])) {
1286: *mat_j++ = tcol - 1;
1287: *mat_a++ = vals[k];
1288: ilen_row++;
1289: }
1290: }
1291: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1292: imat_ilen[row] = ilen_row;
1293: }
1294: }
1295: }
1296: }
1298: /* Now assemble the off proc rows*/
1299: {
1300: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1301: PetscInt *imat_j,*imat_i;
1302: PetscScalar *imat_a,*rbuf4_i;
1304: for (tmp2=0; tmp2<nrqs; tmp2++) {
1305: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1306: idex = pa[idex2];
1307: sbuf1_i = sbuf1[idex];
1308: jmax = sbuf1_i[0];
1309: ct1 = 2*jmax + 1;
1310: ct2 = 0;
1311: rbuf2_i = rbuf2[idex2];
1312: rbuf3_i = rbuf3[idex2];
1313: rbuf4_i = rbuf4[idex2];
1314: for (j=1; j<=jmax; j++) {
1315: is_no = sbuf1_i[2*j-1];
1316: rmap_i = rmap[is_no];
1317: cmap_i = cmap[is_no];
1318: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1319: imat_ilen = mat->ilen;
1320: imat_j = mat->j;
1321: imat_i = mat->i;
1322: imat_a = mat->a;
1323: max1 = sbuf1_i[2*j];
1324: for (k=0; k<max1; k++,ct1++) {
1325: row = sbuf1_i[ct1];
1326: row = rmap_i[row];
1327: ilen = imat_ilen[row];
1328: mat_i = imat_i[row] ;
1329: mat_a = imat_a + mat_i;
1330: mat_j = imat_j + mat_i;
1331: max2 = rbuf2_i[ct1];
1332: for (l=0; l<max2; l++,ct2++) {
1333: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1334: *mat_j++ = tcol - 1;
1335: *mat_a++ = rbuf4_i[ct2];
1336: ilen++;
1337: }
1338: }
1339: imat_ilen[row] = ilen;
1340: }
1341: }
1342: }
1343: }
1344: PetscFree(r_status4);
1345: PetscFree(r_waits4);
1346: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1347: PetscFree(s_waits4);
1348: PetscFree(s_status4);
1350: /* Restore the indices */
1351: for (i=0; i<ismax; i++) {
1352: ISRestoreIndices(isrow[i],irow+i);
1353: ISRestoreIndices(iscol[i],icol+i);
1354: }
1356: /* Destroy allocated memory */
1357: PetscFree(irow);
1358: PetscFree(w1);
1359: PetscFree(pa);
1361: PetscFree(sbuf1);
1362: PetscFree(rbuf2);
1363: for (i=0; i<nrqr; ++i) {
1364: PetscFree(sbuf2[i]);
1365: }
1366: for (i=0; i<nrqs; ++i) {
1367: PetscFree(rbuf3[i]);
1368: PetscFree(rbuf4[i]);
1369: }
1371: PetscFree(sbuf2);
1372: PetscFree(rbuf3);
1373: PetscFree(rbuf4);
1374: PetscFree(sbuf_aj[0]);
1375: PetscFree(sbuf_aj);
1376: PetscFree(sbuf_aa[0]);
1377: PetscFree(sbuf_aa);
1378:
1379: PetscFree(cmap);
1380: PetscFree(rmap);
1381: PetscFree(lens);
1383: for (i=0; i<ismax; i++) {
1384: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1385: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1386: }
1387: return(0);
1388: }