Actual source code: baijov.c
1: #define PETSCMAT_DLL
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5:    and to find submatrices that are shared across processors.
6: */
7:  #include "src/mat/impls/baij/mpi/mpibaij.h"
8:  #include "petscbt.h"
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
12: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
13: EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
18: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19: {
21: PetscInt i,N=C->cmap.N, bs=C->rmap.bs;
22: IS *is_new;
25: PetscMalloc(imax*sizeof(IS),&is_new);
26: /* Convert the indices into block format */
27: ISCompressIndicesGeneral(N,bs,imax,is,is_new);
28: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
29: for (i=0; i<ov; ++i) {
30: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
31: }
32: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
33: ISExpandIndicesGeneral(N,bs,imax,is_new,is);
34: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
35: PetscFree(is_new);
36: return(0);
37: }
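/* Minimal usage sketch (not part of the original file): a caller normally reaches
   MatIncreaseOverlap_MPIBAIJ() through MatIncreaseOverlap() on an assembled
   MATMPIBAIJ matrix.  'C' and the row indices below are hypothetical. */
#if 0
   Mat      C;                    /* assembled MATMPIBAIJ matrix */
   IS       is[1];
   PetscInt rows[3] = {0,1,2};    /* point (non-block) indices; compressed to block indices internally */

   ISCreateGeneral(PETSC_COMM_SELF,3,rows,&is[0]);
   MatIncreaseOverlap(C,1,is,2);  /* grow the index set by an overlap of 2 */
   ISDestroy(is[0]);              /* is[0] was replaced by the enlarged set */
#endif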
39: /*
40: Sample message format:
41: If a processor A wants processor B to process some elements corresponding
42: to index sets is[1], is[5]
43: mesg [0] = 2 (no of index sets in the mesg)
44: -----------
45: mesg [1] = 1 => is[1]
46: mesg [2] = sizeof(is[1]);
47: -----------
48:     mesg [3] = 5  =>  is[5]
49:     mesg [4] = sizeof(is[5]);
50:     -----------
51:     mesg [5]
52: mesg [n] data(is[1])
53: -----------
54: mesg[n+1]
55: mesg[m] data(is[5])
56: -----------
57:
58: Notes:
59: nrqs - no of requests sent (or to be sent out)
60:   nrqr  - no of requests received (which have to be or which have been processed)
61: */
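/* Illustrative sketch (not part of the original file): walking a received message
   'mesg' laid out as described above.  All names here are hypothetical. */
#if 0
   PetscInt nis = mesg[0];          /* number of index set blocks in the message */
   PetscInt s   = 2*nis + 1;        /* data begins right after the header        */
   PetscInt blk,is_id,len;
   for (blk=1; blk<=nis; blk++) {
     is_id = mesg[2*blk-1];         /* which is[] this block belongs to          */
     len   = mesg[2*blk];           /* number of row indices in the block        */
     /* mesg[s] .. mesg[s+len-1] are the row indices destined for is[is_id] */
     s    += len;
   }
#endif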
64: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
65: {
66: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
67: PetscInt **idx,*n,*w3,*w4,*rtable,**data,len,*idx_i;
69: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
70: PetscInt Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
71: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
72: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
73: PetscBT *table;
74: MPI_Comm comm;
75: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
76: MPI_Status *s_status,*recv_status;
79: comm = C->comm;
80: size = c->size;
81: rank = c->rank;
82: Mbs = c->Mbs;
84: PetscObjectGetNewTag((PetscObject)C,&tag1);
85: PetscObjectGetNewTag((PetscObject)C,&tag2);
87: len = (imax+1)*sizeof(PetscInt*)+ (imax + Mbs)*sizeof(PetscInt);
88: PetscMalloc(len,&idx);
89: n = (PetscInt*)(idx + imax);
90: rtable = n + imax;
91:
92: for (i=0; i<imax; i++) {
93: ISGetIndices(is[i],&idx[i]);
94: ISGetLocalSize(is[i],&n[i]);
95: }
96:
97: /* Create hash table for the mapping :row -> proc*/
98: for (i=0,j=0; i<size; i++) {
99: len = c->rangebs[i+1];
100: for (; j<len; j++) {
101: rtable[j] = i;
102: }
103: }
105:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
106: required. Based on this, buffers are allocated, and data copied into them*/
107: PetscMalloc(size*2*sizeof(PetscInt)+size*2*sizeof(PetscMPIInt),&w1);/* mesg size */
108: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
109: w3 = (PetscInt*) (w2 + size); /* no of IS that needs to be sent to proc i */
110: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
111: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
112: for (i=0; i<imax; i++) {
113: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
114: idx_i = idx[i];
115: len = n[i];
116: for (j=0; j<len; j++) {
117: row = idx_i[j];
118: if (row < 0) {
119: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
120: }
121: proc = rtable[row];
122: w4[proc]++;
123: }
124: for (j=0; j<size; j++){
125: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
126: }
127: }
129: nrqs = 0; /* no of outgoing messages */
130:   msz  = 0;              /* total mesg length (for all procs) */
131: w1[rank] = 0; /* no mesg sent to itself */
132: w3[rank] = 0;
133: for (i=0; i<size; i++) {
134: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
135: }
136: /* pa - is list of processors to communicate with */
137: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
138: for (i=0,j=0; i<size; i++) {
139: if (w1[i]) {pa[j] = i; j++;}
140: }
142: /* Each message would have a header = 1 + 2*(no of IS) + data */
143: for (i=0; i<nrqs; i++) {
144: j = pa[i];
145: w1[j] += w2[j] + 2*w3[j];
146: msz += w1[j];
147: }
148:
149:   /* Determine the number of messages to expect, their lengths, and the from-ids */
150: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
151: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
153: /* Now post the Irecvs corresponding to these messages */
154: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
155:
156: /* Allocate Memory for outgoing messages */
157: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
158: PetscMalloc(len,&outdat);
159: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
160: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
161: tmp = (PetscInt*)(outdat + 2*size);
162: ctr = tmp + msz;
164: {
165: PetscInt *iptr = tmp,ict = 0;
166: for (i=0; i<nrqs; i++) {
167: j = pa[i];
168: iptr += ict;
169: outdat[j] = iptr;
170: ict = w1[j];
171: }
172: }
174: /* Form the outgoing messages */
175: /*plug in the headers*/
176: for (i=0; i<nrqs; i++) {
177: j = pa[i];
178: outdat[j][0] = 0;
179: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
180: ptr[j] = outdat[j] + 2*w3[j] + 1;
181: }
182:
183: /* Memory for doing local proc's work*/
184: {
185: PetscInt *d_p;
186: char *t_p;
188: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
189: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
190: PetscMalloc(len,&table);
191: PetscMemzero(table,len);
192: data = (PetscInt **)(table + imax);
193: isz = (PetscInt *)(data + imax);
194: d_p = (PetscInt *)(isz + imax);
195: t_p = (char *)(d_p + Mbs*imax);
196: for (i=0; i<imax; i++) {
197: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
198: data[i] = d_p + (Mbs)*i;
199: }
200: }
202: /* Parse the IS and update local tables and the outgoing buf with the data*/
203: {
204: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
205: PetscBT table_i;
207: for (i=0; i<imax; i++) {
208: PetscMemzero(ctr,size*sizeof(PetscInt));
209: n_i = n[i];
210: table_i = table[i];
211: idx_i = idx[i];
212: data_i = data[i];
213: isz_i = isz[i];
214: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
215: row = idx_i[j];
216: proc = rtable[row];
217: if (proc != rank) { /* copy to the outgoing buffer */
218: ctr[proc]++;
219: *ptr[proc] = row;
220: ptr[proc]++;
221: }
222: else { /* Update the local table */
223: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
224: }
225: }
226: /* Update the headers for the current IS */
227: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
228: if ((ctr_j = ctr[j])) {
229: outdat_j = outdat[j];
230: k = ++outdat_j[0];
231: outdat_j[2*k] = ctr_j;
232: outdat_j[2*k-1] = i;
233: }
234: }
235: isz[i] = isz_i;
236: }
237: }
238:
239: /* Now post the sends */
240: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
241: for (i=0; i<nrqs; ++i) {
242: j = pa[i];
243: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
244: }
245:
246: /* No longer need the original indices*/
247: for (i=0; i<imax; ++i) {
248: ISRestoreIndices(is[i],idx+i);
249: }
250: PetscFree(idx);
252: for (i=0; i<imax; ++i) {
253: ISDestroy(is[i]);
254: }
255:
256: /* Do Local work*/
257: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
259: /* Receive messages*/
260: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
261: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
262:
263: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
264: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
266: /* Phase 1 sends are complete - deallocate buffers */
267: PetscFree(outdat);
268: PetscFree(w1);
270: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
271: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
272: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
273: PetscFree(rbuf);
275: /* Send the data back*/
276: /* Do a global reduction to know the buffer space req for incoming messages*/
277: {
278: PetscMPIInt *rw1;
279:
280: PetscMalloc(size*sizeof(PetscInt),&rw1);
281: PetscMemzero(rw1,size*sizeof(PetscInt));
283: for (i=0; i<nrqr; ++i) {
284: proc = recv_status[i].MPI_SOURCE;
285: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
286: rw1[proc] = isz1[i];
287: }
288:
289: PetscFree(onodes1);
290: PetscFree(olengths1);
292:     /* Determine the number of messages to expect, their lengths, and the from-ids */
293: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
294: PetscFree(rw1);
295: }
296: /* Now post the Irecvs corresponding to these messages */
297: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
298:
299: /* Now post the sends */
300: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
301: for (i=0; i<nrqr; ++i) {
302: j = recv_status[i].MPI_SOURCE;
303: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
304: }
306: /* receive work done on other processors*/
307: {
308: PetscMPIInt idex;
309: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
310: PetscBT table_i;
311: MPI_Status *status2;
312:
313: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
315: for (i=0; i<nrqs; ++i) {
316: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
317: /* Process the message*/
318: rbuf2_i = rbuf2[idex];
319: ct1 = 2*rbuf2_i[0]+1;
320: jmax = rbuf2[idex][0];
321: for (j=1; j<=jmax; j++) {
322: max = rbuf2_i[2*j];
323: is_no = rbuf2_i[2*j-1];
324: isz_i = isz[is_no];
325: data_i = data[is_no];
326: table_i = table[is_no];
327: for (k=0; k<max; k++,ct1++) {
328: row = rbuf2_i[ct1];
329: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
330: }
331: isz[is_no] = isz_i;
332: }
333: }
334: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
335: PetscFree(status2);
336: }
337:
338: for (i=0; i<imax; ++i) {
339: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
340: }
341:
342:
343: PetscFree(onodes2);
344: PetscFree(olengths2);
346: PetscFree(pa);
347: PetscFree(rbuf2);
348: PetscFree(s_waits1);
349: PetscFree(r_waits1);
350: PetscFree(s_waits2);
351: PetscFree(r_waits2);
352: PetscFree(table);
353: PetscFree(s_status);
354: PetscFree(recv_status);
355: PetscFree(xdata[0]);
356: PetscFree(xdata);
357: PetscFree(isz1);
358: return(0);
359: }
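/* Illustrative sketch (not part of the original file): the counting/length handshake
   used above, in isolation.  w1[] holds per-rank message lengths, w2[p] = 1 marks a
   message to rank p, and comm, tag and nsends are assumed to be set up as in the
   routine above. */
#if 0
   PetscMPIInt  nrecvs,*onodes,*olengths;
   PetscInt   **rbuf;
   MPI_Request *r_waits;

   PetscGatherNumberOfMessages(comm,w2,w1,&nrecvs);                    /* how many messages will arrive  */
   PetscGatherMessageLengths(comm,nsends,nrecvs,w1,&onodes,&olengths); /* from which ranks, and how long */
   PetscPostIrecvInt(comm,tag,nrecvs,onodes,olengths,&rbuf,&r_waits);  /* post the matching MPI_Irecvs   */
#endif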
363: /*
364:    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap to do
365: the work on the local processor.
367: Inputs:
368: C - MAT_MPIBAIJ;
369: imax - total no of index sets processed at a time;
370: table - an array of char - size = Mbs bits.
371:
372: Output:
373: isz - array containing the count of the solution elements corresponding
374: to each index set;
375: data - pointer to the solutions
376: */
377: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
378: {
379: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
380: Mat A = c->A,B = c->B;
381: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
382: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
383: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
384: PetscBT table_i;
387: rstart = c->rstartbs;
388: cstart = c->cstartbs;
389: ai = a->i;
390: aj = a->j;
391: bi = b->i;
392: bj = b->j;
393: garray = c->garray;
395:
396: for (i=0; i<imax; i++) {
397: data_i = data[i];
398: table_i = table[i];
399: isz_i = isz[i];
400: for (j=0,max=isz[i]; j<max; j++) {
401: row = data_i[j] - rstart;
402: start = ai[row];
403: end = ai[row+1];
404: for (k=start; k<end; k++) { /* Amat */
405: val = aj[k] + cstart;
406: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
407: }
408: start = bi[row];
409: end = bi[row+1];
410: for (k=start; k<end; k++) { /* Bmat */
411: val = garray[bj[k]];
412: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
413: }
414: }
415: isz[i] = isz_i;
416: }
417: return(0);
418: }
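/* Illustrative sketch (not part of the original file): the PetscBT bit table used
   above records each block row the first time it is seen, so values are appended
   exactly once.  'Mbs' is assumed; the sample values are made up. */
#if 0
   PetscBT  seen;
   PetscInt vals[5] = {3,7,3,1,7},uniq[5],nuniq = 0,i;

   PetscBTCreate(Mbs,seen);                                            /* one bit per block row  */
   for (i=0; i<5; i++) {
     if (!PetscBTLookupSet(seen,vals[i])) uniq[nuniq++] = vals[i];     /* append on first sight  */
   }
   PetscBTDestroy(seen);                                               /* nuniq == 3: {3,7,1}    */
#endif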
421: /*
422:      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
423: and return the output
425: Input:
426: C - the matrix
427: nrqr - no of messages being processed.
428:       rbuf - an array of pointers to the received requests
429:
430: Output:
431: xdata - array of messages to be sent back
432: isz1 - size of each message
434:   For better efficiency perhaps we should malloc each xdata[i] separately;
435:   then, if a realloc is required, we need only copy the data for that one
436:   row rather than for all previous rows, as is done now where a single large
437:   chunk of memory is used.
439: */
440: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
441: {
442: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
443: Mat A = c->A,B = c->B;
444: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
446: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
447: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
448: PetscInt val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
449: PetscInt *rbuf_i,kmax,rbuf_0;
450: PetscBT xtable;
453: rank = c->rank;
454: Mbs = c->Mbs;
455: rstart = c->rstartbs;
456: cstart = c->cstartbs;
457: ai = a->i;
458: aj = a->j;
459: bi = b->i;
460: bj = b->j;
461: garray = c->garray;
462:
463:
464: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
465: rbuf_i = rbuf[i];
466: rbuf_0 = rbuf_i[0];
467: ct += rbuf_0;
468: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
469: }
470:
471: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
472: else max1 = 1;
473: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
474: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
475: ++no_malloc;
476: PetscBTCreate(Mbs,xtable);
477: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
478:
479: ct3 = 0;
480:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
481: rbuf_i = rbuf[i];
482: rbuf_0 = rbuf_i[0];
483: ct1 = 2*rbuf_0+1;
484: ct2 = ct1;
485: ct3 += ct1;
486: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
487: PetscBTMemzero(Mbs,xtable);
488: oct2 = ct2;
489: kmax = rbuf_i[2*j];
490: for (k=0; k<kmax; k++,ct1++) {
491: row = rbuf_i[ct1];
492: if (!PetscBTLookupSet(xtable,row)) {
493: if (!(ct3 < mem_estimate)) {
494: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
495: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
496: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
497: PetscFree(xdata[0]);
498: xdata[0] = tmp;
499: mem_estimate = new_estimate; ++no_malloc;
500: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
501: }
502: xdata[i][ct2++] = row;
503: ct3++;
504: }
505: }
506: for (k=oct2,max2=ct2; k<max2; k++) {
507: row = xdata[i][k] - rstart;
508: start = ai[row];
509: end = ai[row+1];
510: for (l=start; l<end; l++) {
511: val = aj[l] + cstart;
512: if (!PetscBTLookupSet(xtable,val)) {
513: if (!(ct3 < mem_estimate)) {
514: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
515: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
516: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
517: PetscFree(xdata[0]);
518: xdata[0] = tmp;
519: mem_estimate = new_estimate; ++no_malloc;
520: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
521: }
522: xdata[i][ct2++] = val;
523: ct3++;
524: }
525: }
526: start = bi[row];
527: end = bi[row+1];
528: for (l=start; l<end; l++) {
529: val = garray[bj[l]];
530: if (!PetscBTLookupSet(xtable,val)) {
531: if (!(ct3 < mem_estimate)) {
532: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
533: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
534: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
535: PetscFree(xdata[0]);
536: xdata[0] = tmp;
537: mem_estimate = new_estimate; ++no_malloc;
538: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
539: }
540: xdata[i][ct2++] = val;
541: ct3++;
542: }
543: }
544: }
545: /* Update the header*/
546: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
547: xdata[i][2*j-1] = rbuf_i[2*j-1];
548: }
549: xdata[i][0] = rbuf_0;
550: xdata[i+1] = xdata[i] + ct2;
551: isz1[i] = ct2; /* size of each message */
552: }
553: PetscBTDestroy(xtable);
554: PetscInfo3(0,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
555: return(0);
556: }
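/* Illustrative sketch (not part of the original file): the growth strategy used above
   keeps every reply in the single chunk xdata[0]; when the estimate is exhausted the
   chunk is enlarged by roughly 1.5x and the per-message pointers are re-based.  The
   variable names are those of the routine above. */
#if 0
   new_estimate = (PetscInt)(1.5*mem_estimate) + 1;
   PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
   PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
   PetscFree(xdata[0]);
   xdata[0]     = tmp;
   mem_estimate = new_estimate;
   for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];  /* re-base the pointers */
#endif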
558: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
562: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
563: {
564: IS *isrow_new,*iscol_new;
565: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
567: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap.N,bs=C->rmap.bs;
570:   /* The compression and expansion should be avoided. It doesn't point out
571:      errors and might change the indices, hence it is buggy */
573: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
574: iscol_new = isrow_new + ismax;
575: ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);
576: ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);
578: /* Allocate memory to hold all the submatrices */
579: if (scall != MAT_REUSE_MATRIX) {
580: PetscMalloc((ismax+1)*sizeof(Mat),submat);
581: }
582: /* Determine the number of stages through which submatrices are done */
583: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
584: if (!nmax) nmax = 1;
585: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
586:
587: /* Make sure every processor loops through the nstages */
588: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,C->comm);
589: for (i=0,pos=0; i<nstages; i++) {
590: if (pos+nmax <= ismax) max_no = nmax;
591: else if (pos == ismax) max_no = 0;
592: else max_no = ismax-pos;
593: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
594: pos += max_no;
595: }
596:
597: for (i=0; i<ismax; i++) {
598: ISDestroy(isrow_new[i]);
599: ISDestroy(iscol_new[i]);
600: }
601: PetscFree(isrow_new);
602: return(0);
603: }
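/* Minimal usage sketch (not part of the original file): MatGetSubMatrices_MPIBAIJ()
   is normally reached through MatGetSubMatrices() with sorted column index sets.
   'C' and the indices below are hypothetical. */
#if 0
   Mat      C,*submat;            /* C is an assembled MATMPIBAIJ matrix */
   IS       isrow,iscol;
   PetscInt idx[2] = {0,1};       /* global indices, sorted as required  */

   ISCreateGeneral(PETSC_COMM_SELF,2,idx,&isrow);
   ISCreateGeneral(PETSC_COMM_SELF,2,idx,&iscol);
   MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
   /* ... use submat[0], a sequential (MATSEQBAIJ) matrix ... */
   MatDestroyMatrices(1,&submat);
   ISDestroy(isrow);
   ISDestroy(iscol);
#endif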
605: #if defined (PETSC_USE_CTABLE)
608: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
609: {
610: PetscInt nGlobalNd = proc_gnode[size];
611: PetscMPIInt fproc = (PetscMPIInt) ((float)row * (float)size / (float)nGlobalNd + 0.5);
612:
614: if (fproc > size) fproc = size;
615: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
616: if (row < proc_gnode[fproc]) fproc--;
617: else fproc++;
618: }
619: *rank = fproc;
620: return(0);
621: }
622: #endif
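/* Illustrative sketch (not part of the original file): PetscGetProc() above maps a
   global block row to its owning rank via the prefix array of ownership ranges
   (c->rangebs in the callers).  The numbers here are made up. */
#if 0
   PetscInt    proc_gnode[4] = {0,10,20,30};   /* 3 ranks owning 10 block rows each */
   PetscMPIInt owner;

   PetscGetProc(17,3,proc_gnode,&owner);       /* block row 17 lives on rank 1 */
#endif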
624: /* -------------------------------------------------------------------------*/
625: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
628: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
629: {
630: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
631: Mat A = c->A;
632: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
633: PetscInt **irow,**icol,*nrow,*ncol,*w3,*w4,start;
635: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
636: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
637: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
638: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
639: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
640: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
641: PetscInt bs=C->rmap.bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
642: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
643: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
644: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
645: MPI_Request *r_waits4,*s_waits3,*s_waits4;
646: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
647: MPI_Status *r_status3,*r_status4,*s_status4;
648: MPI_Comm comm;
649: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
650: MatScalar *a_a=a->a,*b_a=b->a;
651: PetscTruth flag;
652: PetscMPIInt *onodes1,*olengths1;
654: #if defined (PETSC_USE_CTABLE)
655: PetscInt tt;
656: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
657: #else
658: PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
659: #endif
662: comm = C->comm;
663: tag0 = C->tag;
664: size = c->size;
665: rank = c->rank;
666:
667: /* Get some new tags to keep the communication clean */
668: PetscObjectGetNewTag((PetscObject)C,&tag1);
669: PetscObjectGetNewTag((PetscObject)C,&tag2);
670: PetscObjectGetNewTag((PetscObject)C,&tag3);
672: /* Check if the col indices are sorted */
673: for (i=0; i<ismax; i++) {
674: ISSorted(iscol[i],(PetscTruth*)&j);
675: if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
676: }
678: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
679: #if !defined (PETSC_USE_CTABLE)
680: len += (Mbs+1)*sizeof(PetscInt);
681: #endif
682: PetscMalloc(len,&irow);
683: icol = irow + ismax;
684: nrow = (PetscInt*)(icol + ismax);
685: ncol = nrow + ismax;
686: #if !defined (PETSC_USE_CTABLE)
687: rtable = ncol + ismax;
688: /* Create hash table for the mapping :row -> proc*/
689: for (i=0,j=0; i<size; i++) {
690: jmax = c->rowners[i+1];
691: for (; j<jmax; j++) {
692: rtable[j] = i;
693: }
694: }
695: #endif
696:
697: for (i=0; i<ismax; i++) {
698: ISGetIndices(isrow[i],&irow[i]);
699: ISGetIndices(iscol[i],&icol[i]);
700: ISGetLocalSize(isrow[i],&nrow[i]);
701: ISGetLocalSize(iscol[i],&ncol[i]);
702: }
704:   /* evaluate communication - mesg to whom, length of mesg, and buffer space
705: required. Based on this, buffers are allocated, and data copied into them*/
706: PetscMalloc(size*2*sizeof(PetscInt) + size*2*sizeof(PetscMPIInt),&w1); /* mesg size */
707: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
708: w3 = (PetscInt*)(w2 + size); /* no of IS that needs to be sent to proc i */
709: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
710: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
711: for (i=0; i<ismax; i++) {
712: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
713: jmax = nrow[i];
714: irow_i = irow[i];
715: for (j=0; j<jmax; j++) {
716: row = irow_i[j];
717: #if defined (PETSC_USE_CTABLE)
718: PetscGetProc(row,size,c->rangebs,&proc);
719: #else
720: proc = rtable[row];
721: #endif
722: w4[proc]++;
723: }
724: for (j=0; j<size; j++) {
725: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
726: }
727: }
729: nrqs = 0; /* no of outgoing messages */
730:   msz      = 0;              /* total mesg length for all procs */
731:   w1[rank] = 0;              /* no mesg sent to itself */
732: w3[rank] = 0;
733: for (i=0; i<size; i++) {
734: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
735: }
736: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
737: for (i=0,j=0; i<size; i++) {
738: if (w1[i]) { pa[j] = i; j++; }
739: }
741: /* Each message would have a header = 1 + 2*(no of IS) + data */
742: for (i=0; i<nrqs; i++) {
743: j = pa[i];
744: w1[j] += w2[j] + 2* w3[j];
745: msz += w1[j];
746: }
748:   /* Determine the number of messages to expect, their lengths, and the from-ids */
749: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
750: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
752: /* Now post the Irecvs corresponding to these messages */
753: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
754:
755: PetscFree(onodes1);
756: PetscFree(olengths1);
758: /* Allocate Memory for outgoing messages */
759: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
760: PetscMalloc(len,&sbuf1);
761: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
762: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
763: /* allocate memory for outgoing data + buf to receive the first reply */
764: tmp = (PetscInt*)(ptr + size);
765: ctr = tmp + 2*msz;
767: {
768: PetscInt *iptr = tmp,ict = 0;
769: for (i=0; i<nrqs; i++) {
770: j = pa[i];
771: iptr += ict;
772: sbuf1[j] = iptr;
773: ict = w1[j];
774: }
775: }
777: /* Form the outgoing messages */
778: /* Initialise the header space */
779: for (i=0; i<nrqs; i++) {
780: j = pa[i];
781: sbuf1[j][0] = 0;
782: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
783: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
784: }
785:
786: /* Parse the isrow and copy data into outbuf */
787: for (i=0; i<ismax; i++) {
788: PetscMemzero(ctr,size*sizeof(PetscInt));
789: irow_i = irow[i];
790: jmax = nrow[i];
791: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
792: row = irow_i[j];
793: #if defined (PETSC_USE_CTABLE)
794: PetscGetProc(row,size,c->rangebs,&proc);
795: #else
796: proc = rtable[row];
797: #endif
798: if (proc != rank) { /* copy to the outgoing buf*/
799: ctr[proc]++;
800: *ptr[proc] = row;
801: ptr[proc]++;
802: }
803: }
804: /* Update the headers for the current IS */
805: for (j=0; j<size; j++) { /* Can Optimise this loop too */
806: if ((ctr_j = ctr[j])) {
807: sbuf1_j = sbuf1[j];
808: k = ++sbuf1_j[0];
809: sbuf1_j[2*k] = ctr_j;
810: sbuf1_j[2*k-1] = i;
811: }
812: }
813: }
815: /* Now post the sends */
816: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
817: for (i=0; i<nrqs; ++i) {
818: j = pa[i];
819: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
820: }
822:   /* Post Receives to capture the buffer size */
823: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
824: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
825: rbuf2[0] = tmp + msz;
826: for (i=1; i<nrqs; ++i) {
827: j = pa[i];
828: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
829: }
830: for (i=0; i<nrqs; ++i) {
831: j = pa[i];
832: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
833: }
835: /* Send to other procs the buf size they should allocate */
837: /* Receive messages*/
838: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
839: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
840: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
841: PetscMalloc(len,&sbuf2);
842: req_size = (PetscInt*)(sbuf2 + nrqr);
843: req_source = req_size + nrqr;
844:
845: {
846: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
847: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
849: for (i=0; i<nrqr; ++i) {
850: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
851: req_size[idex] = 0;
852: rbuf1_i = rbuf1[idex];
853: start = 2*rbuf1_i[0] + 1;
854: MPI_Get_count(r_status1+i,MPIU_INT,&end);
855: PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);
856: sbuf2_i = sbuf2[idex];
857: for (j=start; j<end; j++) {
858: id = rbuf1_i[j] - rstart;
859: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
860: sbuf2_i[j] = ncols;
861: req_size[idex] += ncols;
862: }
863: req_source[idex] = r_status1[i].MPI_SOURCE;
864: /* form the header */
865: sbuf2_i[0] = req_size[idex];
866: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
867: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
868: }
869: }
870: PetscFree(r_status1);
871: PetscFree(r_waits1);
873: /* recv buffer sizes */
874: /* Receive messages*/
876: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
877: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
878: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
879: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
880: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
882: for (i=0; i<nrqs; ++i) {
883: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
884: PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);
885: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
886: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
887: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
888: }
889: PetscFree(r_status2);
890: PetscFree(r_waits2);
891:
892: /* Wait on sends1 and sends2 */
893: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
894: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
896: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
897: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
898: PetscFree(s_status1);
899: PetscFree(s_status2);
900: PetscFree(s_waits1);
901: PetscFree(s_waits2);
903: /* Now allocate buffers for a->j, and send them off */
904: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
905: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
906: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
907: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
908:
909: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
910: {
911: for (i=0; i<nrqr; i++) {
912: rbuf1_i = rbuf1[i];
913: sbuf_aj_i = sbuf_aj[i];
914: ct1 = 2*rbuf1_i[0] + 1;
915: ct2 = 0;
916: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
917: kmax = rbuf1[i][2*j];
918: for (k=0; k<kmax; k++,ct1++) {
919: row = rbuf1_i[ct1] - rstart;
920: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
921: ncols = nzA + nzB;
922: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
924: /* load the column indices for this row into cols*/
925: cols = sbuf_aj_i + ct2;
926: for (l=0; l<nzB; l++) {
927: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
928: else break;
929: }
930: imark = l;
931: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
932: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
933: ct2 += ncols;
934: }
935: }
936: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
937: }
938: }
939: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
940: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
942: /* Allocate buffers for a->a, and send them off */
943: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
944: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
945: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
946: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
947:
948: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
949: {
950: for (i=0; i<nrqr; i++) {
951: rbuf1_i = rbuf1[i];
952: sbuf_aa_i = sbuf_aa[i];
953: ct1 = 2*rbuf1_i[0]+1;
954: ct2 = 0;
955: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
956: kmax = rbuf1_i[2*j];
957: for (k=0; k<kmax; k++,ct1++) {
958: row = rbuf1_i[ct1] - rstart;
959: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
960: ncols = nzA + nzB;
961: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
962: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
964: /* load the column values for this row into vals*/
965: vals = sbuf_aa_i+ct2*bs2;
966: for (l=0; l<nzB; l++) {
967: if ((bmap[cworkB[l]]) < cstart) {
968: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
969: }
970: else break;
971: }
972: imark = l;
973: for (l=0; l<nzA; l++) {
974: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
975: }
976: for (l=imark; l<nzB; l++) {
977: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
978: }
979: ct2 += ncols;
980: }
981: }
982: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
983: }
984: }
985: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
986: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
987: PetscFree(rbuf1);
989: /* Form the matrix */
990: /* create col map */
991: {
992: PetscInt *icol_i;
993: #if defined (PETSC_USE_CTABLE)
994:     /* Create col map*/
995: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
996: for (i=0; i<ismax; i++) {
997: PetscTableCreate(ncol[i]+1,&colmaps[i]);
998: }
999: #else
1000: len = (1+ismax)*sizeof(PetscInt*)+ ismax*c->Nbs*sizeof(PetscInt);
1001: PetscMalloc(len,&cmap);
1002: cmap[0] = (PetscInt*)(cmap + ismax);
1003: PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(PetscInt));
1004: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1005: #endif
1006: for (i=0; i<ismax; i++) {
1007: jmax = ncol[i];
1008: icol_i = icol[i];
1009: #if defined (PETSC_USE_CTABLE)
1010: lcol1_gcol1 = colmaps[i];
1011: for (j=0; j<jmax; j++) {
1012: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1013: }
1014: #else
1015: cmap_i = cmap[i];
1016: for (j=0; j<jmax; j++) {
1017: cmap_i[icol_i[j]] = j+1;
1018: }
1019: #endif
1020: }
1021: }
1023: /* Create lens which is required for MatCreate... */
1024: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1025: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1026: PetscMalloc(len,&lens);
1027: lens[0] = (PetscInt*)(lens + ismax);
1028: PetscMemzero(lens[0],j*sizeof(PetscInt));
1029: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1030:
1031: /* Update lens from local data */
1032: for (i=0; i<ismax; i++) {
1033: jmax = nrow[i];
1034: #if defined (PETSC_USE_CTABLE)
1035: lcol1_gcol1 = colmaps[i];
1036: #else
1037: cmap_i = cmap[i];
1038: #endif
1039: irow_i = irow[i];
1040: lens_i = lens[i];
1041: for (j=0; j<jmax; j++) {
1042: row = irow_i[j];
1043: #if defined (PETSC_USE_CTABLE)
1044: PetscGetProc(row,size,c->rangebs,&proc);
1045: #else
1046: proc = rtable[row];
1047: #endif
1048: if (proc == rank) {
1049: /* Get indices from matA and then from matB */
1050: row = row - rstart;
1051: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1052: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1053: #if defined (PETSC_USE_CTABLE)
1054: for (k=0; k<nzA; k++) {
1055: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1056: if (tt) { lens_i[j]++; }
1057: }
1058: for (k=0; k<nzB; k++) {
1059: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1060: if (tt) { lens_i[j]++; }
1061: }
1062: #else
1063: for (k=0; k<nzA; k++) {
1064: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1065: }
1066: for (k=0; k<nzB; k++) {
1067: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1068: }
1069: #endif
1070: }
1071: }
1072: }
1073: #if defined (PETSC_USE_CTABLE)
1074: /* Create row map*/
1075: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1076: for (i=0; i<ismax; i++){
1077: PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1078: }
1079: #else
1080: /* Create row map*/
1081: len = (1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt);
1082: PetscMalloc(len,&rmap);
1083: rmap[0] = (PetscInt*)(rmap + ismax);
1084: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1085: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1086: #endif
1087: for (i=0; i<ismax; i++) {
1088: irow_i = irow[i];
1089: jmax = nrow[i];
1090: #if defined (PETSC_USE_CTABLE)
1091: lrow1_grow1 = rowmaps[i];
1092: for (j=0; j<jmax; j++) {
1093: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1094: }
1095: #else
1096: rmap_i = rmap[i];
1097: for (j=0; j<jmax; j++) {
1098: rmap_i[irow_i[j]] = j;
1099: }
1100: #endif
1101: }
1103: /* Update lens from offproc data */
1104: {
1105: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1106: PetscMPIInt ii;
1108: for (tmp2=0; tmp2<nrqs; tmp2++) {
1109: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1110: idex = pa[ii];
1111: sbuf1_i = sbuf1[idex];
1112: jmax = sbuf1_i[0];
1113: ct1 = 2*jmax+1;
1114: ct2 = 0;
1115: rbuf2_i = rbuf2[ii];
1116: rbuf3_i = rbuf3[ii];
1117: for (j=1; j<=jmax; j++) {
1118: is_no = sbuf1_i[2*j-1];
1119: max1 = sbuf1_i[2*j];
1120: lens_i = lens[is_no];
1121: #if defined (PETSC_USE_CTABLE)
1122: lcol1_gcol1 = colmaps[is_no];
1123: lrow1_grow1 = rowmaps[is_no];
1124: #else
1125: cmap_i = cmap[is_no];
1126: rmap_i = rmap[is_no];
1127: #endif
1128: for (k=0; k<max1; k++,ct1++) {
1129: #if defined (PETSC_USE_CTABLE)
1130: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1131: row--;
1132: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1133: #else
1134: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1135: #endif
1136: max2 = rbuf2_i[ct1];
1137: for (l=0; l<max2; l++,ct2++) {
1138: #if defined (PETSC_USE_CTABLE)
1139: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1140: if (tt) {
1141: lens_i[row]++;
1142: }
1143: #else
1144: if (cmap_i[rbuf3_i[ct2]]) {
1145: lens_i[row]++;
1146: }
1147: #endif
1148: }
1149: }
1150: }
1151: }
1152: }
1153: PetscFree(r_status3);
1154: PetscFree(r_waits3);
1155: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1156: PetscFree(s_status3);
1157: PetscFree(s_waits3);
1159: /* Create the submatrices */
1160: if (scall == MAT_REUSE_MATRIX) {
1161: /*
1162: Assumes new rows are same length as the old rows, hence bug!
1163: */
1164: for (i=0; i<ismax; i++) {
1165: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1166: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap.bs != bs)) {
1167: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1168: }
1169: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1170: if (!flag) {
1171: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1172: }
1173:       /* Initialize the matrix as if it were empty */
1174: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1175: submats[i]->factor = C->factor;
1176: }
1177: } else {
1178: for (i=0; i<ismax; i++) {
1179: MatCreate(PETSC_COMM_SELF,submats+i);
1180: MatSetSizes(submats[i],nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs);
1181: MatSetType(submats[i],A->type_name);
1182: MatSeqBAIJSetPreallocation(submats[i],C->rmap.bs,0,lens[i]);
1183: MatSeqSBAIJSetPreallocation(submats[i],C->rmap.bs,0,lens[i]);
1184: }
1185: }
1187: /* Assemble the matrices */
1188: /* First assemble the local rows */
1189: {
1190: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1191: MatScalar *imat_a;
1192:
1193: for (i=0; i<ismax; i++) {
1194: mat = (Mat_SeqBAIJ*)submats[i]->data;
1195: imat_ilen = mat->ilen;
1196: imat_j = mat->j;
1197: imat_i = mat->i;
1198: imat_a = mat->a;
1200: #if defined (PETSC_USE_CTABLE)
1201: lcol1_gcol1 = colmaps[i];
1202: lrow1_grow1 = rowmaps[i];
1203: #else
1204: cmap_i = cmap[i];
1205: rmap_i = rmap[i];
1206: #endif
1207: irow_i = irow[i];
1208: jmax = nrow[i];
1209: for (j=0; j<jmax; j++) {
1210: row = irow_i[j];
1211: #if defined (PETSC_USE_CTABLE)
1212: PetscGetProc(row,size,c->rangebs,&proc);
1213: #else
1214: proc = rtable[row];
1215: #endif
1216: if (proc == rank) {
1217: row = row - rstart;
1218: nzA = a_i[row+1] - a_i[row];
1219: nzB = b_i[row+1] - b_i[row];
1220: cworkA = a_j + a_i[row];
1221: cworkB = b_j + b_i[row];
1222: vworkA = a_a + a_i[row]*bs2;
1223: vworkB = b_a + b_i[row]*bs2;
1224: #if defined (PETSC_USE_CTABLE)
1225: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1226: row--;
1227: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1228: #else
1229: row = rmap_i[row + rstart];
1230: #endif
1231: mat_i = imat_i[row];
1232: mat_a = imat_a + mat_i*bs2;
1233: mat_j = imat_j + mat_i;
1234: ilen_row = imat_ilen[row];
1236: /* load the column indices for this row into cols*/
1237: for (l=0; l<nzB; l++) {
1238: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1239: #if defined (PETSC_USE_CTABLE)
1240: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1241: if (tcol) {
1242: #else
1243: if ((tcol = cmap_i[ctmp])) {
1244: #endif
1245: *mat_j++ = tcol - 1;
1246: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1247: mat_a += bs2;
1248: ilen_row++;
1249: }
1250: } else break;
1251: }
1252: imark = l;
1253: for (l=0; l<nzA; l++) {
1254: #if defined (PETSC_USE_CTABLE)
1255: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1256: if (tcol) {
1257: #else
1258: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1259: #endif
1260: *mat_j++ = tcol - 1;
1261: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1262: mat_a += bs2;
1263: ilen_row++;
1264: }
1265: }
1266: for (l=imark; l<nzB; l++) {
1267: #if defined (PETSC_USE_CTABLE)
1268: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1269: if (tcol) {
1270: #else
1271: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1272: #endif
1273: *mat_j++ = tcol - 1;
1274: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1275: mat_a += bs2;
1276: ilen_row++;
1277: }
1278: }
1279: imat_ilen[row] = ilen_row;
1280: }
1281: }
1282:
1283: }
1284: }
1286: /* Now assemble the off proc rows*/
1287: {
1288: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1289: PetscInt *imat_j,*imat_i;
1290: MatScalar *imat_a,*rbuf4_i;
1291: PetscMPIInt ii;
1293: for (tmp2=0; tmp2<nrqs; tmp2++) {
1294: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1295: idex = pa[ii];
1296: sbuf1_i = sbuf1[idex];
1297: jmax = sbuf1_i[0];
1298: ct1 = 2*jmax + 1;
1299: ct2 = 0;
1300: rbuf2_i = rbuf2[ii];
1301: rbuf3_i = rbuf3[ii];
1302: rbuf4_i = rbuf4[ii];
1303: for (j=1; j<=jmax; j++) {
1304: is_no = sbuf1_i[2*j-1];
1305: #if defined (PETSC_USE_CTABLE)
1306: lrow1_grow1 = rowmaps[is_no];
1307: lcol1_gcol1 = colmaps[is_no];
1308: #else
1309: rmap_i = rmap[is_no];
1310: cmap_i = cmap[is_no];
1311: #endif
1312: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1313: imat_ilen = mat->ilen;
1314: imat_j = mat->j;
1315: imat_i = mat->i;
1316: imat_a = mat->a;
1317: max1 = sbuf1_i[2*j];
1318: for (k=0; k<max1; k++,ct1++) {
1319: row = sbuf1_i[ct1];
1320: #if defined (PETSC_USE_CTABLE)
1321: PetscTableFind(lrow1_grow1,row+1,&row);
1322: row--;
1323: if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1324: #else
1325: row = rmap_i[row];
1326: #endif
1327: ilen = imat_ilen[row];
1328: mat_i = imat_i[row];
1329: mat_a = imat_a + mat_i*bs2;
1330: mat_j = imat_j + mat_i;
1331: max2 = rbuf2_i[ct1];
1332: for (l=0; l<max2; l++,ct2++) {
1333: #if defined (PETSC_USE_CTABLE)
1334: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1335: if (tcol) {
1336: #else
1337: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1338: #endif
1339: *mat_j++ = tcol - 1;
1340: /* *mat_a++= rbuf4_i[ct2]; */
1341: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1342: mat_a += bs2;
1343: ilen++;
1344: }
1345: }
1346: imat_ilen[row] = ilen;
1347: }
1348: }
1349: }
1350: }
1351: PetscFree(r_status4);
1352: PetscFree(r_waits4);
1353: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1354: PetscFree(s_waits4);
1355: PetscFree(s_status4);
1357: /* Restore the indices */
1358: for (i=0; i<ismax; i++) {
1359: ISRestoreIndices(isrow[i],irow+i);
1360: ISRestoreIndices(iscol[i],icol+i);
1361: }
1363: /* Destroy allocated memory */
1364: PetscFree(irow);
1365: PetscFree(w1);
1366: PetscFree(pa);
1368: PetscFree(sbuf1);
1369: PetscFree(rbuf2);
1370: for (i=0; i<nrqr; ++i) {
1371: PetscFree(sbuf2[i]);
1372: }
1373: for (i=0; i<nrqs; ++i) {
1374: PetscFree(rbuf3[i]);
1375: PetscFree(rbuf4[i]);
1376: }
1378: PetscFree(sbuf2);
1379: PetscFree(rbuf3);
1380: PetscFree(rbuf4);
1381: PetscFree(sbuf_aj[0]);
1382: PetscFree(sbuf_aj);
1383: PetscFree(sbuf_aa[0]);
1384: PetscFree(sbuf_aa);
1386: #if defined (PETSC_USE_CTABLE)
1387: for (i=0; i<ismax; i++){
1388: PetscTableDelete(rowmaps[i]);
1389: PetscTableDelete(colmaps[i]);
1390: }
1391: PetscFree(colmaps);
1392: PetscFree(rowmaps);
1393: #else
1394: PetscFree(rmap);
1395: PetscFree(cmap);
1396: #endif
1397: PetscFree(lens);
1399: for (i=0; i<ismax; i++) {
1400: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1401: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1402: }
1403: return(0);
1404: }
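/* Illustrative sketch (not part of the original file): the routine above follows the
   usual two-pass pattern -- count the nonzero blocks of every row (lens), preallocate
   the sequential BAIJ matrix, then fill and assemble it.  A hypothetical standalone
   version of that pattern: */
#if 0
   Mat      S;
   PetscInt bs = 2,m = 4,lens[4] = {2,1,3,2};   /* made-up per-block-row counts */

   MatCreate(PETSC_COMM_SELF,&S);
   MatSetSizes(S,m*bs,m*bs,m*bs,m*bs);
   MatSetType(S,MATSEQBAIJ);
   MatSeqBAIJSetPreallocation(S,bs,0,lens);     /* no further mallocs during insertion */
   /* ... MatSetValuesBlocked() for each row ... */
   MatAssemblyBegin(S,MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(S,MAT_FINAL_ASSEMBLY);
   MatDestroy(S);
#endif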