Actual source code: mpiov.c
1: /*$Id: mpiov.c,v 1.26.1.76.2.22 2001/09/07 20:09:38 bsmith Exp $*/
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that were shared across processors.
5: */
6: #include "src/mat/impls/aij/mpi/mpiaij.h"
7: #include "petscbt.h"
9: static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
10: static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
11: static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
12: EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13: EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
15: int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS *is,int ov)
16: {
17: int i,ierr;
20: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21: for (i=0; i<ov; ++i) {
22: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
23: }
24: return(0);
25: }
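/*
   Usage sketch (editorial addition, not part of mpiov.c): this routine is the
   MPIAIJ implementation behind the public MatIncreaseOverlap() interface, so a
   typical call from application code wanting one extra level of overlap on a
   set of local index sets looks roughly like

      IS  is[2];                                    index sets built beforehand
      int ierr;
      ierr = MatIncreaseOverlap(C,2,is,1);CHKERRQ(ierr);

   Each pass of the loop above calls MatIncreaseOverlap_MPIAIJ_Once(), which
   replaces every is[i] by the union of its rows and all columns coupled to
   those rows, so ov levels of overlap cost ov communication rounds.
*/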
27: /*
28: Sample message format:
29: If a processor A wants processor B to process some elements corresponding
30: to index sets is[1],is[5]
31: mesg [0] = 2 (no of index sets in the mesg)
32: -----------
33: mesg [1] = 1 => is[1]
34: mesg [2] = sizeof(is[1]);
35: -----------
36: mesg [3] = 5 => is[5]
37: mesg [4] = sizeof(is[5]);
38: -----------
39: mesg [5]
40: mesg [n] data(is[1])
41: -----------
42: mesg[n+1]
43: mesg[m] data(is[5])
44: -----------
45:
46: Notes:
47: nrqs - no of requests sent (or to be sent out)
48: nrqr - no of requests received (which have to be or which have been processed)
49: */
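/*
   Worked example (editorial addition, not part of mpiov.c): if processor A
   asks processor B to process rows {3,7} belonging to is[1] and rows
   {9,12,15} belonging to is[5], the packed request is

      mesg[] = { 2,              2 index sets in this message
                 1, 2,           is[1], followed by 2 rows
                 5, 3,           is[5], followed by 3 rows
                 3, 7,           data for is[1]
                 9, 12, 15 };    data for is[5]

   i.e. length = 1 + 2*(no of IS) + data, which is exactly the header size
   added to w1[] below.
*/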
50: static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS *is)
51: {
52: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
53: int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
54: int size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
55: int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
56: int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
57: PetscBT *table;
58: MPI_Comm comm;
59: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
60: MPI_Status *s_status,*recv_status;
63: comm = C->comm;
64: size = c->size;
65: rank = c->rank;
66: m = C->M;
68: PetscObjectGetNewTag((PetscObject)C,&tag1);
69: PetscObjectGetNewTag((PetscObject)C,&tag2);
70:
71: len = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
72: ierr = PetscMalloc(len,&idx);
73: n = (int*)(idx + imax);
74: rtable = n + imax;
75:
76: for (i=0; i<imax; i++) {
77: ISGetIndices(is[i],&idx[i]);
78: ISGetLocalSize(is[i],&n[i]);
79: }
80:
81: /* Create hash table for the mapping :row -> proc*/
82: for (i=0,j=0; i<size; i++) {
83: len = c->rowners[i+1];
84: for (; j<len; j++) {
85: rtable[j] = i;
86: }
87: }
89: /* evaluate communication - mesg to who, length of mesg, and buffer space
90: required. Based on this, buffers are allocated, and data copied into them */
91: PetscMalloc(size*4*sizeof(int),&w1);/* mesg size */
92: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
93: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
94: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
95: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
96: for (i=0; i<imax; i++) {
97: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
98: idx_i = idx[i];
99: len = n[i];
100: for (j=0; j<len; j++) {
101: row = idx_i[j];
102: if (row < 0) {
103: SETERRQ(1,"Index set cannot have negative entries");
104: }
105: proc = rtable[row];
106: w4[proc]++;
107: }
108: for (j=0; j<size; j++){
109: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
110: }
111: }
113: nrqs = 0; /* no of outgoing messages */
114: msz = 0; /* total mesg length (for all procs) */
115: w1[rank] = 0; /* no mesg sent to itself */
116: w3[rank] = 0;
117: for (i=0; i<size; i++) {
118: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
119: }
120: /* pa - is list of processors to communicate with */
121: PetscMalloc((nrqs+1)*sizeof(int),&pa);
122: for (i=0,j=0; i<size; i++) {
123: if (w1[i]) {pa[j] = i; j++;}
124: }
126: /* Each message would have a header = 1 + 2*(no of IS) + data */
127: for (i=0; i<nrqs; i++) {
128: j = pa[i];
129: w1[j] += w2[j] + 2*w3[j];
130: msz += w1[j];
131: }
133: /* Determine the number of messages to expect, their lengths, and from-ids */
134: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
135: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
137: /* Now post the Irecvs corresponding to these messages */
138: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
140: /* Allocate Memory for outgoing messages */
141: len = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
142: PetscMalloc(len,&outdat);
143: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
144: PetscMemzero(outdat,2*size*sizeof(int*));
145: tmp = (int*)(outdat + 2*size);
146: ctr = tmp + msz;
148: {
149: int *iptr = tmp,ict = 0;
150: for (i=0; i<nrqs; i++) {
151: j = pa[i];
152: iptr += ict;
153: outdat[j] = iptr;
154: ict = w1[j];
155: }
156: }
158: /* Form the outgoing messages */
159: /*plug in the headers*/
160: for (i=0; i<nrqs; i++) {
161: j = pa[i];
162: outdat[j][0] = 0;
163: ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
164: ptr[j] = outdat[j] + 2*w3[j] + 1;
165: }
166:
167: /* Memory for doing local proc's work*/
168: {
169: int *d_p;
170: char *t_p;
172: len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
173: (m)*imax*sizeof(int) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
174: ierr = PetscMalloc(len,&table);
175: ierr = PetscMemzero(table,len);
176: data = (int **)(table + imax);
177: isz = (int *)(data + imax);
178: d_p = (int *)(isz + imax);
179: t_p = (char *)(d_p + m*imax);
180: for (i=0; i<imax; i++) {
181: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
182: data[i] = d_p + (m)*i;
183: }
184: }
186: /* Parse the IS and update local tables and the outgoing buf with the data*/
187: {
188: int n_i,*data_i,isz_i,*outdat_j,ctr_j;
189: PetscBT table_i;
191: for (i=0; i<imax; i++) {
192: ierr = PetscMemzero(ctr,size*sizeof(int));
193: n_i = n[i];
194: table_i = table[i];
195: idx_i = idx[i];
196: data_i = data[i];
197: isz_i = isz[i];
198: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
199: row = idx_i[j];
200: proc = rtable[row];
201: if (proc != rank) { /* copy to the outgoing buffer */
202: ctr[proc]++;
203: *ptr[proc] = row;
204: ptr[proc]++;
205: } else { /* Update the local table */
206: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
207: }
208: }
209: /* Update the headers for the current IS */
210: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
211: if ((ctr_j = ctr[j])) {
212: outdat_j = outdat[j];
213: k = ++outdat_j[0];
214: outdat_j[2*k] = ctr_j;
215: outdat_j[2*k-1] = i;
216: }
217: }
218: isz[i] = isz_i;
219: }
220: }
221:
224: /* Now post the sends */
225: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
226: for (i=0; i<nrqs; ++i) {
227: j = pa[i];
228: MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
229: }
230:
231: /* No longer need the original indices*/
232: for (i=0; i<imax; ++i) {
233: ISRestoreIndices(is[i],idx+i);
234: }
235: PetscFree(idx);
237: for (i=0; i<imax; ++i) {
238: ISDestroy(is[i]);
239: }
240:
241: /* Do Local work*/
242: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
244: /* Receive messages*/
245: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
246: MPI_Waitall(nrqr,r_waits1,recv_status);
247:
248: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
249: MPI_Waitall(nrqs,s_waits1,s_status);
251: /* Phase 1 sends are complete - deallocate buffers */
252: PetscFree(outdat);
253: PetscFree(w1);
255: PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
256: PetscMalloc((nrqr+1)*sizeof(int),&isz1);
257: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
258: PetscFree(rbuf);
260:
261: /* Send the data back*/
262: /* Do a global reduction to know the buffer space req for incoming messages*/
263: {
264: int *rw1;
265:
266: PetscMalloc(size*sizeof(int),&rw1);
267: PetscMemzero(rw1,size*sizeof(int));
269: for (i=0; i<nrqr; ++i) {
270: proc = recv_status[i].MPI_SOURCE;
271: if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
272: rw1[proc] = isz1[i];
273: }
274: PetscFree(onodes1);
275: PetscFree(olengths1);
277: /* Determine the number of messages to expect, their lengths, and from-ids */
278: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
279: PetscFree(rw1);
280: }
281: /* Now post the Irecvs corresponding to these messages */
282: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
284: /* Now post the sends */
285: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
286: for (i=0; i<nrqr; ++i) {
287: j = recv_status[i].MPI_SOURCE;
288: MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
289: }
291: /* receive work done on other processors*/
292: {
293: int index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
294: PetscBT table_i;
295: MPI_Status *status2;
296:
297: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&status2);
298: for (i=0; i<nrqs; ++i) {
299: MPI_Waitany(nrqs,r_waits2,&index,status2+i);
300: /* Process the message*/
301: rbuf2_i = rbuf2[index];
302: ct1 = 2*rbuf2_i[0]+1;
303: jmax = rbuf2[index][0];
304: for (j=1; j<=jmax; j++) {
305: max = rbuf2_i[2*j];
306: is_no = rbuf2_i[2*j-1];
307: isz_i = isz[is_no];
308: data_i = data[is_no];
309: table_i = table[is_no];
310: for (k=0; k<max; k++,ct1++) {
311: row = rbuf2_i[ct1];
312: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
313: }
314: isz[is_no] = isz_i;
315: }
316: }
318: MPI_Waitall(nrqr,s_waits2,status2);
319: PetscFree(status2);
320: }
321:
322: for (i=0; i<imax; ++i) {
323: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
324: }
325:
326: PetscFree(onodes2);
327: PetscFree(olengths2);
329: PetscFree(pa);
330: PetscFree(rbuf2);
331: PetscFree(s_waits1);
332: PetscFree(r_waits1);
333: PetscFree(s_waits2);
334: PetscFree(r_waits2);
335: PetscFree(table);
336: PetscFree(s_status);
337: PetscFree(recv_status);
338: PetscFree(xdata[0]);
339: PetscFree(xdata);
340: PetscFree(isz1);
341: return(0);
342: }
344: /*
345: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
346: the work on the local processor.
348: Inputs:
349: C - MAT_MPIAIJ;
350: imax - total no of index sets processed at a time;
351: table - an array of char - size = m bits.
352:
353: Output:
354: isz - array containing the count of the solution elements corresponding
355: to each index set;
356: data - pointer to the solutions
357: */
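/*
   Note (editorial addition): PetscBTLookupSet(bt,k) sets bit k of the bit
   table and returns the bit's previous value, so the recurring idiom

      if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }

   appends val to the overlap list only the first time it is encountered,
   producing a duplicate-free union of the column indices of all marked rows.
*/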
358: static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
359: {
360: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
361: Mat A = c->A,B = c->B;
362: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
363: int start,end,val,max,rstart,cstart,ashift,bshift,*ai,*aj;
364: int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
365: PetscBT table_i;
368: rstart = c->rstart;
369: cstart = c->cstart;
370: ashift = a->indexshift;
371: ai = a->i;
372: aj = a->j +ashift;
373: bshift = b->indexshift;
374: bi = b->i;
375: bj = b->j +bshift;
376: garray = c->garray;
378:
379: for (i=0; i<imax; i++) {
380: data_i = data[i];
381: table_i = table[i];
382: isz_i = isz[i];
383: for (j=0,max=isz[i]; j<max; j++) {
384: row = data_i[j] - rstart;
385: start = ai[row];
386: end = ai[row+1];
387: for (k=start; k<end; k++) { /* Amat */
388: val = aj[k] + ashift + cstart;
389: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
390: }
391: start = bi[row];
392: end = bi[row+1];
393: for (k=start; k<end; k++) { /* Bmat */
394: val = garray[bj[k]+bshift];
395: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
396: }
397: }
398: isz[i] = isz_i;
399: }
400: return(0);
401: }
403: /*
404: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
405: and return the output
407: Input:
408: C - the matrix
409: nrqr - no of messages being processed.
410: rbuf - an array of pointers to the received requests
411:
412: Output:
413: xdata - array of messages to be sent back
414: isz1 - size of each message
416: For better efficiency perhaps we should malloc separately each xdata[i],
417: then if a re-malloc is required we need only copy the data for that one row
418: rather than all previous rows as it is now where a single large chunk of
419: memory is used.
421: */
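/*
   Worked example (editorial addition, inferred from the code below): each
   reply xdata[i] mirrors the request layout, e.g. a reply covering two index
   sets could be

      xdata[i][] = { 2,            2 index sets in this reply
                     1, 4,         is[1], 4 expanded rows follow
                     5, 6,         is[5], 6 expanded rows follow
                     ... };        the expanded row lists themselves

   and isz1[i] records the total reply length so the requesting processor can
   post a matching receive.
*/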
422: static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
423: {
424: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
425: Mat A = c->A,B = c->B;
426: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
427: int rstart,cstart,ashift,bshift,*ai,*aj,*bi,*bj,*garray,i,j,k;
428: int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
429: int val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
430: int *rbuf_i,kmax,rbuf_0,ierr;
431: PetscBT xtable;
434: rank = c->rank;
435: m = C->M;
436: rstart = c->rstart;
437: cstart = c->cstart;
438: ashift = a->indexshift;
439: ai = a->i;
440: aj = a->j +ashift;
441: bshift = b->indexshift;
442: bi = b->i;
443: bj = b->j +bshift;
444: garray = c->garray;
445:
446:
447: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
448: rbuf_i = rbuf[i];
449: rbuf_0 = rbuf_i[0];
450: ct += rbuf_0;
451: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
452: }
453:
454: if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
455: else max1 = 1;
456: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
457: ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
458: ++no_malloc;
459: ierr = PetscBTCreate(m,xtable);
460: ierr = PetscMemzero(isz1,nrqr*sizeof(int));
461:
462: ct3 = 0;
463: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
464: rbuf_i = rbuf[i];
465: rbuf_0 = rbuf_i[0];
466: ct1 = 2*rbuf_0+1;
467: ct2 = ct1;
468: ct3 += ct1;
469: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
470: PetscBTMemzero(m,xtable);
471: oct2 = ct2;
472: kmax = rbuf_i[2*j];
473: for (k=0; k<kmax; k++,ct1++) {
474: row = rbuf_i[ct1];
475: if (!PetscBTLookupSet(xtable,row)) {
476: if (!(ct3 < mem_estimate)) {
477: new_estimate = (int)(1.5*mem_estimate)+1;
478: ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
479: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
480: ierr = PetscFree(xdata[0]);
481: xdata[0] = tmp;
482: mem_estimate = new_estimate; ++no_malloc;
483: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
484: }
485: xdata[i][ct2++] = row;
486: ct3++;
487: }
488: }
489: for (k=oct2,max2=ct2; k<max2; k++) {
490: row = xdata[i][k] - rstart;
491: start = ai[row];
492: end = ai[row+1];
493: for (l=start; l<end; l++) {
494: val = aj[l] + ashift + cstart;
495: if (!PetscBTLookupSet(xtable,val)) {
496: if (!(ct3 < mem_estimate)) {
497: new_estimate = (int)(1.5*mem_estimate)+1;
498: ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
499: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
500: ierr = PetscFree(xdata[0]);
501: xdata[0] = tmp;
502: mem_estimate = new_estimate; ++no_malloc;
503: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
504: }
505: xdata[i][ct2++] = val;
506: ct3++;
507: }
508: }
509: start = bi[row];
510: end = bi[row+1];
511: for (l=start; l<end; l++) {
512: val = garray[bj[l]+bshift];
513: if (!PetscBTLookupSet(xtable,val)) {
514: if (!(ct3 < mem_estimate)) {
515: new_estimate = (int)(1.5*mem_estimate)+1;
516: ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
517: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
518: PetscFree(xdata[0]);
519: xdata[0] = tmp;
520: mem_estimate = new_estimate; ++no_malloc;
521: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
522: }
523: xdata[i][ct2++] = val;
524: ct3++;
525: }
526: }
527: }
528: /* Update the header*/
529: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
530: xdata[i][2*j-1] = rbuf_i[2*j-1];
531: }
532: xdata[i][0] = rbuf_0;
533: xdata[i+1] = xdata[i] + ct2;
534: isz1[i] = ct2; /* size of each message */
535: }
536: PetscBTDestroy(xtable);
537: PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
538: return(0);
539: }
540: /* -------------------------------------------------------------------------*/
541: EXTERN int MatGetSubMatrices_MPIAIJ_Local(Mat,int,IS*,IS*,MatReuse,Mat*);
542: EXTERN int MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
543: /*
544: Every processor gets the entire matrix
545: */
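/*
   Sketch (editorial addition): the routine below relies on MPI_Allgatherv()
   with per-rank counts and displacements derived from the row ownership
   ranges, e.g. with three processes owning 2, 3 and 1 rows respectively

      recvcounts[] = { 2, 3, 1 };
      displs[]     = { 0, 2, 5 };    displs[i] = displs[i-1] + recvcounts[i-1]

   so every process ends up with the complete row-length, column-index and
   numerical-value arrays of the global matrix.
*/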
546: int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat **Bin)
547: {
548: Mat B;
549: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
550: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
551: int ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
552: int m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
553: PetscScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
556: MPI_Comm_size(A->comm,&size);
557: MPI_Comm_rank(A->comm,&rank);
559: if (scall == MAT_INITIAL_MATRIX) {
560: /* ----------------------------------------------------------------
561: Tell every processor the number of nonzeros per row
562: */
563: PetscMalloc(A->M*sizeof(int),&lens);
564: for (i=a->rstart; i<a->rend; i++) {
565: lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
566: }
567: sendcount = a->rend - a->rstart;
568: PetscMalloc(2*size*sizeof(int),&recvcounts);
569: displs = recvcounts + size;
570: for (i=0; i<size; i++) {
571: recvcounts[i] = a->rowners[i+1] - a->rowners[i];
572: displs[i] = a->rowners[i];
573: }
574: ierr = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);
576: /* ---------------------------------------------------------------
577: Create the sequential matrix
578: */
579: ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,A->M,A->N,0,lens,&B);
580: ierr = PetscMalloc(sizeof(Mat),Bin);
581: **Bin = B;
582: b = (Mat_SeqAIJ *)B->data;
584: /*--------------------------------------------------------------------
585: Copy my part of matrix column indices over
586: */
587: sendcount = ad->nz + bd->nz;
588: jsendbuf = b->j + b->i[rstarts[rank]];
589: a_jsendbuf = ad->j;
590: b_jsendbuf = bd->j;
591: n = a->rend - a->rstart;
592: cnt = 0;
593: for (i=0; i<n; i++) {
595: /* put in lower diagonal portion */
596: m = bd->i[i+1] - bd->i[i];
597: while (m > 0) {
598: /* is it above diagonal (in bd (compressed) numbering) */
599: if (garray[*b_jsendbuf] > a->rstart + i) break;
600: jsendbuf[cnt++] = garray[*b_jsendbuf++];
601: m--;
602: }
604: /* put in diagonal portion */
605: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
606: jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
607: }
609: /* put in upper diagonal portion */
610: while (m-- > 0) {
611: jsendbuf[cnt++] = garray[*b_jsendbuf++];
612: }
613: }
614: if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
616: /*--------------------------------------------------------------------
617: Gather all column indices to all processors
618: */
619: for (i=0; i<size; i++) {
620: recvcounts[i] = 0;
621: for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
622: recvcounts[i] += lens[j];
623: }
624: }
625: displs[0] = 0;
626: for (i=1; i<size; i++) {
627: displs[i] = displs[i-1] + recvcounts[i-1];
628: }
629: MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);
631: /*--------------------------------------------------------------------
632: Assemble the matrix into usable form (note numerical values not yet set)
633: */
634: /* set the b->ilen (length of each row) values */
635: PetscMemcpy(b->ilen,lens,A->M*sizeof(int));
636: /* set the b->i indices */
637: b->i[0] = 0;
638: for (i=1; i<=A->M; i++) {
639: b->i[i] = b->i[i-1] + lens[i-1];
640: }
641: PetscFree(lens);
642: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
643: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
644: } else {
645: B = **Bin;
646: b = (Mat_SeqAIJ *)B->data;
647: }
649: /*--------------------------------------------------------------------
650: Copy my part of matrix numerical values into the values location
651: */
652: sendcount = ad->nz + bd->nz;
653: sendbuf = b->a + b->i[rstarts[rank]];
654: a_sendbuf = ad->a;
655: b_sendbuf = bd->a;
656: b_sendj = bd->j;
657: n = a->rend - a->rstart;
658: cnt = 0;
659: for (i=0; i<n; i++) {
661: /* put in lower diagonal portion */
662: m = bd->i[i+1] - bd->i[i];
663: while (m > 0) {
664: /* is it above diagonal (in bd (compressed) numbering) */
665: if (garray[*b_sendj] > a->rstart + i) break;
666: sendbuf[cnt++] = *b_sendbuf++;
667: m--;
668: b_sendj++;
669: }
671: /* put in diagonal portion */
672: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
673: sendbuf[cnt++] = *a_sendbuf++;
674: }
676: /* put in upper diagonal portion */
677: while (m-- > 0) {
678: sendbuf[cnt++] = *b_sendbuf++;
679: b_sendj++;
680: }
681: }
682: if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
683:
684: /* -----------------------------------------------------------------
685: Gather all numerical values to all processors
686: */
687: if (!recvcounts) {
688: ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);
689: displs = recvcounts + size;
690: }
691: for (i=0; i<size; i++) {
692: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
693: }
694: displs[0] = 0;
695: for (i=1; i<size; i++) {
696: displs[i] = displs[i-1] + recvcounts[i-1];
697: }
698: recvbuf = b->a;
699: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);
700: PetscFree(recvcounts);
702: return(0);
703: }
705: int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
706: {
707: int nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
708: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
711: /*
712: Check for special case each processor gets entire matrix
713: */
714: if (ismax == 1 && C->M == C->N) {
715: ISIdentity(*isrow,&rowflag);
716: ISIdentity(*iscol,&colflag);
717: ISGetLocalSize(*isrow,&nrow);
718: ISGetLocalSize(*iscol,&ncol);
719: if (rowflag && colflag && nrow == C->M && ncol == C->N) {
720: wantallmatrix = PETSC_TRUE;
721: PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
722: }
723: }
724: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);
725: if (twantallmatrix) {
726: MatGetSubMatrix_MPIAIJ_All(C,scall,submat);
727: return(0);
728: }
730: /* Allocate memory to hold all the submatrices */
731: if (scall != MAT_REUSE_MATRIX) {
732: PetscMalloc((ismax+1)*sizeof(Mat),submat);
733: }
734: /* Determine the number of stages through which submatrices are done */
735: nmax = 20*1000000 / (C->N * sizeof(int));
736: if (!nmax) nmax = 1;
737: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
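      /* Illustrative arithmetic (editorial addition): with C->N = 1,000,000
         columns and 4-byte ints, nmax = 20*10^6/(10^6*4) = 5, i.e. at most 5
         index sets are handled per stage so the per-stage work arrays stay
         around the 20 MB target. */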
739: /* Make sure every processor loops through the nstages */
740: MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
742: for (i=0,pos=0; i<nstages; i++) {
743: if (pos+nmax <= ismax) max_no = nmax;
744: else if (pos == ismax) max_no = 0;
745: else max_no = ismax-pos;
746: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
747: pos += max_no;
748: }
749: return(0);
750: }
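/*
   Usage sketch (editorial addition, not part of mpiov.c): the routine above
   sits behind the public MatGetSubMatrices() interface; a typical caller
   extracts one sequential submatrix per local index-set pair, e.g.

      Mat *submat;
      int ierr;
      ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   and passes MAT_REUSE_MATRIX on later calls so the already allocated
   submatrices (and their nonzero patterns) are refilled in place.
*/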
751: /* -------------------------------------------------------------------------*/
752: int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
753: {
754: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
755: Mat A = c->A;
756: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
757: int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
758: int **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
759: int nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
760: int **rbuf3,*req_source,**sbuf_aj,ashift,**rbuf2,max1,max2,**rmap;
761: int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
762: int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
763: int *rmap_i,tag0,tag1,tag2,tag3;
764: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
765: MPI_Request *r_waits4,*s_waits3,*s_waits4;
766: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
767: MPI_Status *r_status3,*r_status4,*s_status4;
768: MPI_Comm comm;
769: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
770: PetscTruth sorted;
771: int *onodes1,*olengths1;
774: comm = C->comm;
775: tag0 = C->tag;
776: size = c->size;
777: rank = c->rank;
778: m = C->M;
779: ashift = a->indexshift;
780:
781: /* Get some new tags to keep the communication clean */
782: PetscObjectGetNewTag((PetscObject)C,&tag1);
783: PetscObjectGetNewTag((PetscObject)C,&tag2);
784: PetscObjectGetNewTag((PetscObject)C,&tag3);
786: /* Check if the row and col indices are sorted */
787: for (i=0; i<ismax; i++) {
788: ISSorted(isrow[i],&sorted);
789: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
790: ISSorted(iscol[i],&sorted);
791: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
792: }
794: len = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
795: ierr = PetscMalloc(len,&irow);
796: icol = irow + ismax;
797: nrow = (int*)(icol + ismax);
798: ncol = nrow + ismax;
799: rtable = ncol + ismax;
801: for (i=0; i<ismax; i++) {
802: ISGetIndices(isrow[i],&irow[i]);
803: ISGetIndices(iscol[i],&icol[i]);
804: ISGetLocalSize(isrow[i],&nrow[i]);
805: ISGetLocalSize(iscol[i],&ncol[i]);
806: }
808: /* Create hash table for the mapping :row -> proc*/
809: for (i=0,j=0; i<size; i++) {
810: jmax = c->rowners[i+1];
811: for (; j<jmax; j++) {
812: rtable[j] = i;
813: }
814: }
816: /* evaluate communication - mesg to who, length of mesg, and buffer space
817: required. Based on this, buffers are allocated, and data copied into them*/
818: ierr = PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
819: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
820: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
821: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
822: ierr = PetscMemzero(w1,size*3*sizeof(int)); /* initialize work vector*/
823: for (i=0; i<ismax; i++) {
824: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialize work vector*/
825: jmax = nrow[i];
826: irow_i = irow[i];
827: for (j=0; j<jmax; j++) {
828: row = irow_i[j];
829: proc = rtable[row];
830: w4[proc]++;
831: }
832: for (j=0; j<size; j++) {
833: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
834: }
835: }
836:
837: nrqs = 0; /* no of outgoing messages */
838: msz = 0; /* total mesg length (for all procs) */
839: w1[rank] = 0; /* no mesg sent to self */
840: w3[rank] = 0;
841: for (i=0; i<size; i++) {
842: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
843: }
844: PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
845: for (i=0,j=0; i<size; i++) {
846: if (w1[i]) { pa[j] = i; j++; }
847: }
849: /* Each message would have a header = 1 + 2*(no of IS) + data */
850: for (i=0; i<nrqs; i++) {
851: j = pa[i];
852: w1[j] += w2[j] + 2* w3[j];
853: msz += w1[j];
854: }
856: /* Determine the number of messages to expect, their lengths, and from-ids */
857: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
858: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
860: /* Now post the Irecvs corresponding to these messages */
861: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
862:
863: PetscFree(onodes1);
864: PetscFree(olengths1);
865:
866: /* Allocate Memory for outgoing messages */
867: len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
868: ierr = PetscMalloc(len,&sbuf1);
869: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
870: ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));
871: /* allocate memory for outgoing data + buf to receive the first reply */
872: tmp = (int*)(ptr + size);
873: ctr = tmp + 2*msz;
875: {
876: int *iptr = tmp,ict = 0;
877: for (i=0; i<nrqs; i++) {
878: j = pa[i];
879: iptr += ict;
880: sbuf1[j] = iptr;
881: ict = w1[j];
882: }
883: }
885: /* Form the outgoing messages */
886: /* Initialize the header space */
887: for (i=0; i<nrqs; i++) {
888: j = pa[i];
889: sbuf1[j][0] = 0;
890: ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
891: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
892: }
893:
894: /* Parse the isrow and copy data into outbuf */
895: for (i=0; i<ismax; i++) {
896: ierr = PetscMemzero(ctr,size*sizeof(int));
897: irow_i = irow[i];
898: jmax = nrow[i];
899: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
900: row = irow_i[j];
901: proc = rtable[row];
902: if (proc != rank) { /* copy to the outgoing buf*/
903: ctr[proc]++;
904: *ptr[proc] = row;
905: ptr[proc]++;
906: }
907: }
908: /* Update the headers for the current IS */
909: for (j=0; j<size; j++) { /* Can Optimise this loop too */
910: if ((ctr_j = ctr[j])) {
911: sbuf1_j = sbuf1[j];
912: k = ++sbuf1_j[0];
913: sbuf1_j[2*k] = ctr_j;
914: sbuf1_j[2*k-1] = i;
915: }
916: }
917: }
919: /* Now post the sends */
920: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
921: for (i=0; i<nrqs; ++i) {
922: j = pa[i];
923: MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
924: }
926: /* Post Receives to capture the buffer size */
927: ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
928: ierr = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
929: rbuf2[0] = tmp + msz;
930: for (i=1; i<nrqs; ++i) {
931: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
932: }
933: for (i=0; i<nrqs; ++i) {
934: j = pa[i];
935: MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
936: }
938: /* Send to other procs the buf size they should allocate */
939:
941: /* Receive messages*/
942: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
943: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
944: len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
945: ierr = PetscMalloc(len,&sbuf2);
946: req_size = (int*)(sbuf2 + nrqr);
947: req_source = req_size + nrqr;
948:
949: {
950: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
951: int *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
952: int *sbuf2_i;
954: for (i=0; i<nrqr; ++i) {
955: MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);
956: req_size[index] = 0;
957: rbuf1_i = rbuf1[index];
958: start = 2*rbuf1_i[0] + 1;
959: ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);
960: ierr = PetscMalloc((end+1)*sizeof(int),&sbuf2[index]);
961: sbuf2_i = sbuf2[index];
962: for (j=start; j<end; j++) {
963: id = rbuf1_i[j] - rstart;
964: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
965: sbuf2_i[j] = ncols;
966: req_size[index] += ncols;
967: }
968: req_source[index] = r_status1[i].MPI_SOURCE;
969: /* form the header */
970: sbuf2_i[0] = req_size[index];
971: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
972: MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);
973: }
974: }
975: PetscFree(r_status1);
976: PetscFree(r_waits1);
978: /* recv buffer sizes */
979: /* Receive messages*/
980:
981: PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
982: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
983: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
984: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
985: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
987: for (i=0; i<nrqs; ++i) {
988: MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);
989: PetscMalloc((rbuf2[index][0]+1)*sizeof(int),&rbuf3[index]);
990: PetscMalloc((rbuf2[index][0]+1)*sizeof(PetscScalar),&rbuf4[index]);
991: MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);
992: MPI_Irecv(rbuf4[index],rbuf2[index][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);
993: }
994: PetscFree(r_status2);
995: PetscFree(r_waits2);
996:
997: /* Wait on sends1 and sends2 */
998: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
999: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1001: MPI_Waitall(nrqs,s_waits1,s_status1);
1002: MPI_Waitall(nrqr,s_waits2,s_status2);
1003: PetscFree(s_status1);
1004: PetscFree(s_status2);
1005: PetscFree(s_waits1);
1006: PetscFree(s_waits2);
1008: /* Now allocate buffers for a->j, and send them off */
1009: PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);
1010: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1011: PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
1012: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1013:
1014: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1015: {
1016: int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1017: int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1018: int *a_j = a->j,*b_j = b->j,shift = a->indexshift,ctmp,*t_cols;
1020: for (i=0; i<nrqr; i++) {
1021: rbuf1_i = rbuf1[i];
1022: sbuf_aj_i = sbuf_aj[i];
1023: ct1 = 2*rbuf1_i[0] + 1;
1024: ct2 = 0;
1025: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1026: kmax = rbuf1[i][2*j];
1027: for (k=0; k<kmax; k++,ct1++) {
1028: row = rbuf1_i[ct1] - rstart;
1029: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1030: ncols = nzA + nzB;
1031: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1033: /* load the column indices for this row into cols*/
1034: cols = sbuf_aj_i + ct2;
1035: if (!shift) {
1036: for (l=0; l<nzB; l++) {
1037: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
1038: else break;
1039: }
1040: imark = l;
1041: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
1042: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1043: } else {
1044: MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0);
1045: PetscMemcpy(cols,t_cols,ncols*sizeof(int));
1046: MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0);
1047: }
1049: ct2 += ncols;
1050: }
1051: }
1052: MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
1053: }
1054: }
1055: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1056: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1058: /* Allocate buffers for a->a, and send them off */
1059: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1060: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1061: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1062: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1063:
1064: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1065: {
1066: int nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1067: int cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1068: int *b_j = b->j,shift = a->indexshift;
1069: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*t_vals;
1070:
1071: for (i=0; i<nrqr; i++) {
1072: rbuf1_i = rbuf1[i];
1073: sbuf_aa_i = sbuf_aa[i];
1074: ct1 = 2*rbuf1_i[0]+1;
1075: ct2 = 0;
1076: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1077: kmax = rbuf1_i[2*j];
1078: for (k=0; k<kmax; k++,ct1++) {
1079: row = rbuf1_i[ct1] - rstart;
1080: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1081: ncols = nzA + nzB;
1082: cworkB = b_j + b_i[row];
1083: vworkA = a_a + a_i[row];
1084: vworkB = b_a + b_i[row];
1086: /* load the column values for this row into vals*/
1087: vals = sbuf_aa_i+ct2;
1088: if (!shift) {
1089: for (l=0; l<nzB; l++) {
1090: if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l];
1091: else break;
1092: }
1093: imark = l;
1094: for (l=0; l<nzA; l++) vals[imark+l] = vworkA[l];
1095: for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1096: } else {
1097: MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals);
1098: PetscMemcpy(vals,t_vals,ncols*sizeof(PetscScalar));
1099: MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals);
1100: }
1101: ct2 += ncols;
1102: }
1103: }
1104: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1105: }
1106: }
1107: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1108: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1109: PetscFree(rbuf1);
1111: /* Form the matrix */
1112: /* create col map */
1113: {
1114: int *icol_i;
1115:
1116: len = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1117: ierr = PetscMalloc(len,&cmap);
1118: cmap[0] = (int *)(cmap + ismax);
1119: ierr = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));
1120: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1121: for (i=0; i<ismax; i++) {
1122: jmax = ncol[i];
1123: icol_i = icol[i];
1124: cmap_i = cmap[i];
1125: for (j=0; j<jmax; j++) {
1126: cmap_i[icol_i[j]] = j+1;
1127: }
1128: }
1129: }
1131: /* Create lens which is required for MatCreate... */
1132: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1133: len = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1134: ierr = PetscMalloc(len,&lens);
1135: lens[0] = (int *)(lens + ismax);
1136: ierr = PetscMemzero(lens[0],j*sizeof(int));
1137: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1138:
1139: /* Update lens from local data */
1140: for (i=0; i<ismax; i++) {
1141: jmax = nrow[i];
1142: cmap_i = cmap[i];
1143: irow_i = irow[i];
1144: lens_i = lens[i];
1145: for (j=0; j<jmax; j++) {
1146: row = irow_i[j];
1147: proc = rtable[row];
1148: if (proc == rank) {
1149: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1150: for (k=0; k<ncols; k++) {
1151: if (cmap_i[cols[k]]) { lens_i[j]++;}
1152: }
1153: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1154: }
1155: }
1156: }
1157:
1158: /* Create row map*/
1159: len = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1160: ierr = PetscMalloc(len,&rmap);
1161: rmap[0] = (int *)(rmap + ismax);
1162: ierr = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));
1163: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1164: for (i=0; i<ismax; i++) {
1165: rmap_i = rmap[i];
1166: irow_i = irow[i];
1167: jmax = nrow[i];
1168: for (j=0; j<jmax; j++) {
1169: rmap_i[irow_i[j]] = j;
1170: }
1171: }
1172:
1173: /* Update lens from offproc data */
1174: {
1175: int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1177: for (tmp2=0; tmp2<nrqs; tmp2++) {
1178: MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1179: index = pa[i];
1180: sbuf1_i = sbuf1[index];
1181: jmax = sbuf1_i[0];
1182: ct1 = 2*jmax+1;
1183: ct2 = 0;
1184: rbuf2_i = rbuf2[i];
1185: rbuf3_i = rbuf3[i];
1186: for (j=1; j<=jmax; j++) {
1187: is_no = sbuf1_i[2*j-1];
1188: max1 = sbuf1_i[2*j];
1189: lens_i = lens[is_no];
1190: cmap_i = cmap[is_no];
1191: rmap_i = rmap[is_no];
1192: for (k=0; k<max1; k++,ct1++) {
1193: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1194: max2 = rbuf2_i[ct1];
1195: for (l=0; l<max2; l++,ct2++) {
1196: if (cmap_i[rbuf3_i[ct2]]) {
1197: lens_i[row]++;
1198: }
1199: }
1200: }
1201: }
1202: }
1203: }
1204: PetscFree(r_status3);
1205: PetscFree(r_waits3);
1206: MPI_Waitall(nrqr,s_waits3,s_status3);
1207: PetscFree(s_status3);
1208: PetscFree(s_waits3);
1210: /* Create the submatrices */
1211: if (scall == MAT_REUSE_MATRIX) {
1212: PetscTruth flag;
1214: /*
1215: Assumes new rows are same length as the old rows, hence bug!
1216: */
1217: for (i=0; i<ismax; i++) {
1218: mat = (Mat_SeqAIJ *)(submats[i]->data);
1219: if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1220: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1221: }
1222: PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);
1223: if (flag == PETSC_FALSE) {
1224: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1225: }
1226: /* Initialize the matrix as if empty */
1227: PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));
1228: submats[i]->factor = C->factor;
1229: }
1230: } else {
1231: for (i=0; i<ismax; i++) {
1232: MatCreateSeqAIJ(PETSC_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);
1233: }
1234: }
1236: /* Assemble the matrices */
1237: /* First assemble the local rows */
1238: {
1239: int ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1240: PetscScalar *imat_a;
1241:
1242: for (i=0; i<ismax; i++) {
1243: mat = (Mat_SeqAIJ*)submats[i]->data;
1244: imat_ilen = mat->ilen;
1245: imat_j = mat->j;
1246: imat_i = mat->i;
1247: imat_a = mat->a;
1248: cmap_i = cmap[i];
1249: rmap_i = rmap[i];
1250: irow_i = irow[i];
1251: jmax = nrow[i];
1252: for (j=0; j<jmax; j++) {
1253: row = irow_i[j];
1254: proc = rtable[row];
1255: if (proc == rank) {
1256: old_row = row;
1257: row = rmap_i[row];
1258: ilen_row = imat_ilen[row];
1259: ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1260: mat_i = imat_i[row] + ashift;
1261: mat_a = imat_a + mat_i;
1262: mat_j = imat_j + mat_i;
1263: for (k=0; k<ncols; k++) {
1264: if ((tcol = cmap_i[cols[k]])) {
1265: *mat_j++ = tcol - (!ashift);
1266: *mat_a++ = vals[k];
1267: ilen_row++;
1268: }
1269: }
1270: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1271: imat_ilen[row] = ilen_row;
1272: }
1273: }
1274: }
1275: }
1277: /* Now assemble the off proc rows*/
1278: {
1279: int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1280: int *imat_j,*imat_i;
1281: PetscScalar *imat_a,*rbuf4_i;
1283: for (tmp2=0; tmp2<nrqs; tmp2++) {
1284: MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1285: index = pa[i];
1286: sbuf1_i = sbuf1[index];
1287: jmax = sbuf1_i[0];
1288: ct1 = 2*jmax + 1;
1289: ct2 = 0;
1290: rbuf2_i = rbuf2[i];
1291: rbuf3_i = rbuf3[i];
1292: rbuf4_i = rbuf4[i];
1293: for (j=1; j<=jmax; j++) {
1294: is_no = sbuf1_i[2*j-1];
1295: rmap_i = rmap[is_no];
1296: cmap_i = cmap[is_no];
1297: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1298: imat_ilen = mat->ilen;
1299: imat_j = mat->j;
1300: imat_i = mat->i;
1301: imat_a = mat->a;
1302: max1 = sbuf1_i[2*j];
1303: for (k=0; k<max1; k++,ct1++) {
1304: row = sbuf1_i[ct1];
1305: row = rmap_i[row];
1306: ilen = imat_ilen[row];
1307: mat_i = imat_i[row] + ashift;
1308: mat_a = imat_a + mat_i;
1309: mat_j = imat_j + mat_i;
1310: max2 = rbuf2_i[ct1];
1311: for (l=0; l<max2; l++,ct2++) {
1312: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1313: *mat_j++ = tcol - (!ashift);
1314: *mat_a++ = rbuf4_i[ct2];
1315: ilen++;
1316: }
1317: }
1318: imat_ilen[row] = ilen;
1319: }
1320: }
1321: }
1322: }
1323: PetscFree(r_status4);
1324: PetscFree(r_waits4);
1325: MPI_Waitall(nrqr,s_waits4,s_status4);
1326: PetscFree(s_waits4);
1327: PetscFree(s_status4);
1329: /* Restore the indices */
1330: for (i=0; i<ismax; i++) {
1331: ISRestoreIndices(isrow[i],irow+i);
1332: ISRestoreIndices(iscol[i],icol+i);
1333: }
1335: /* Destroy allocated memory */
1336: PetscFree(irow);
1337: PetscFree(w1);
1338: PetscFree(pa);
1340: PetscFree(sbuf1);
1341: PetscFree(rbuf2);
1342: for (i=0; i<nrqr; ++i) {
1343: PetscFree(sbuf2[i]);
1344: }
1345: for (i=0; i<nrqs; ++i) {
1346: PetscFree(rbuf3[i]);
1347: PetscFree(rbuf4[i]);
1348: }
1350: PetscFree(sbuf2);
1351: PetscFree(rbuf3);
1352: PetscFree(rbuf4);
1353: PetscFree(sbuf_aj[0]);
1354: PetscFree(sbuf_aj);
1355: PetscFree(sbuf_aa[0]);
1356: PetscFree(sbuf_aa);
1357:
1358: PetscFree(cmap);
1359: PetscFree(rmap);
1360: PetscFree(lens);
1362: for (i=0; i<ismax; i++) {
1363: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1364: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1365: }
1366: return(0);
1367: }