Actual source code: sbaijov.c
1: /*$Id: sbaijov.c,v 1.10 2001/08/06 21:15:59 bsmith Exp $*/
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5: and to find submatrices that are shared across processors.
6: */
7: #include "src/mat/impls/baij/mpi/mpibaij.h"
8: #include "petscbt.h"
10: static int MatIncreaseOverlap_MPISBAIJ_Once(Mat,int,IS *);
11: static int MatIncreaseOverlap_MPISBAIJ_Local(Mat,int,char **,int*,int**);
12: static int MatIncreaseOverlap_MPISBAIJ_Receive(Mat,int,int **,int**,int*);
13: extern int MatGetRow_MPISBAIJ(Mat,int,int*,int**,PetscScalar**);
14: extern int MatRestoreRow_MPISBAIJ(Mat,int,int*,int**,PetscScalar**);
15:
16: static int MatCompressIndicesGeneral_MPISBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
17: {
18: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
19: int ierr,isz,bs = baij->bs,Nbs,n,i,j,*idx,*nidx,ival;
20: PetscBT table;
23: Nbs = baij->Nbs;
24: PetscMalloc((Nbs+1)*sizeof(int),&nidx);
25: PetscBTCreate(Nbs,table);
27: for (i=0; i<imax; i++) {
28: isz = 0;
29: PetscBTMemzero(Nbs,table);
30: ISGetIndices(is_in[i],&idx);
31: ISGetSize(is_in[i],&n);
32: for (j=0; j<n ; j++) {
33: ival = idx[j]/bs; /* convert the indices into block indices */
34: if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
35: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
36: }
37: ISRestoreIndices(is_in[i],&idx);
38: ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));
39: }
40: PetscBTDestroy(table);
41: PetscFree(nidx);
42: return(0);
43: }
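/*
   Editorial aside (not part of sbaijov.c): the index compression above boils
   down to mapping each point index to its block (idx/bs) and keeping each
   block only once.  The stand-alone sketch below, with hypothetical inputs
   bs, n, idx[] and output nidx[], shows just that core step; the real routine
   uses a PetscBT bit array instead of the char map used here.  Compiled out
   with #if 0.
*/
#if 0
{
  int  bs = 2,n = 4,idx[4] = {0,1,5,4},nidx[3],isz = 0,j,ival;
  char seen[3] = {0,0,0};
  for (j=0; j<n; j++) {
    ival = idx[j]/bs;                        /* point index -> block index */
    if (!seen[ival]) { seen[ival] = 1; nidx[isz++] = ival; }
  }
  /* here isz == 2 and nidx[] == {0,2}: points {0,1} lie in block 0, {4,5} in block 2 */
}
#endif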
45: static int MatCompressIndicesSorted_MPISBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
46: {
47: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
48: int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,Nbs=baij->Nbs,*idx_local;
49: PetscTruth flg;
52: for (i=0; i<imax; i++) {
53: ISSorted(is_in[i],&flg);
54: if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Indices are not sorted");
55: }
56: PetscMalloc((Nbs+1)*sizeof(int),&nidx);
57: /* Now check if the indices are in block order */
58: for (i=0; i<imax; i++) {
59: ISGetIndices(is_in[i],&idx);
60: ISGetSize(is_in[i],&n);
61: if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");
63: n = n/bs; /* The reduced index size */
64: idx_local = idx;
65: for (j=0; j<n ; j++) {
66: val = idx_local[0];
67: if (val%bs != 0) SETERRQ(1,"Indices are not block ordered");
68: for (k=0; k<bs; k++) {
69: if (val+k != idx_local[k]) SETERRQ(1,"Indices are not block ordered");
70: }
71: nidx[j] = val/bs;
72: idx_local +=bs;
73: }
74: ISRestoreIndices(is_in[i],&idx);
75: ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));
76: }
77: PetscFree(nidx);
78: return(0);
79: }
81: static int MatExpandIndices_MPISBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
82: {
83: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
84: int ierr,bs = baij->bs,Nbs,n,i,j,k,*idx,*nidx;
87: Nbs = baij->Nbs;
89: PetscMalloc((Nbs*bs+1)*sizeof(int),&nidx);
91: for (i=0; i<imax; i++) {
92: ISGetIndices(is_in[i],&idx);
93: ISGetSize(is_in[i],&n);
94: for (j=0; j<n ; ++j){
95: for (k=0; k<bs; k++)
96: nidx[j*bs+k] = idx[j]*bs+k;
97: }
98: ISRestoreIndices(is_in[i],&idx);
99: ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);
100: }
101: PetscFree(nidx);
102: return(0);
103: }
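/*
   Editorial aside (not part of sbaijov.c): the inverse operation — each block
   index b expands back to the bs point indices b*bs .. b*bs+bs-1.  A tiny
   stand-alone sketch with hypothetical inputs, compiled out with #if 0.
*/
#if 0
{
  int bs = 2,n = 2,idx[2] = {0,2},nidx[4],j,k;
  for (j=0; j<n; j++) {
    for (k=0; k<bs; k++) nidx[j*bs+k] = idx[j]*bs + k;   /* block -> points */
  }
  /* nidx[] == {0,1,4,5} */
}
#endif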
106: int MatIncreaseOverlap_MPISBAIJ(Mat C,int imax,IS *is,int ov)
107: {
108: int i,ierr;
109: IS *is_new;
112: PetscMalloc(imax*sizeof(IS),&is_new);
113: /* Convert the indices into block format */
114: MatCompressIndicesGeneral_MPISBAIJ(C,imax,is,is_new);
115: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
116: for (i=0; i<ov; ++i) {
117: MatIncreaseOverlap_MPISBAIJ_Once(C,imax,is_new);
118: }
119: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
120: MatExpandIndices_MPISBAIJ(C,imax,is_new,is);
121: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
122: PetscFree(is_new);
123: return(0);
124: }
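/*
   Editorial aside (not part of sbaijov.c): a minimal usage sketch of the
   public MatIncreaseOverlap() interface, which dispatches to the routine
   above for this matrix type.  The matrix A, the overlap value 2, and the
   stride IS built from the ownership range are illustrative assumptions;
   error checking is omitted.  Compiled out with #if 0.
*/
#if 0
{
  Mat A;              /* assumed: an assembled parallel (SBAIJ) matrix          */
  IS  is;
  int rstart,rend;
  MatGetOwnershipRange(A,&rstart,&rend);                    /* local row range  */
  ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&is); /* local rows       */
  MatIncreaseOverlap(A,1,&is,2);                            /* grow overlap by 2 */
  /* ... use the enlarged index set, e.g. with MatGetSubMatrices() ... */
  ISDestroy(is);
}
#endif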
126: /*
127: Sample message format:
128: If a processor A wants processor B to process some elements corresponding
129: to index sets is[1], is[5]
130: mesg [0] = 2 (no of index sets in the mesg)
131: -----------
132: mesg [1] = 1 => is[1]
133: mesg [2] = sizeof(is[1]);
134: -----------
135: mesg [3] = 5 => is[5]
136: mesg [4] = sizeof(is[5]);
137: -----------
138: mesg [5]
139: mesg [n] data(is[1])
140: -----------
141: mesg[n+1]
142: mesg[m] data(is[5])
143: -----------
144:
145: Notes:
146: nrqs - no of requests sent (or to be sent out)
147: nrqr - no of requests received (which have to be, or which have been, processed)
148: */
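/*
   Editorial aside (not part of sbaijov.c): how one outgoing message with the
   header layout described above could be packed.  All names and values in
   this sketch (nis, is_no, is_len, data1, data5, msg) are hypothetical.
*/
#if 0
{
  int data1[3] = {10,11,12},data5[2] = {40,41};    /* indices destined for is[1], is[5] */
  int *is_data[2],is_no[2] = {1,5},is_len[2] = {3,2};
  int nis = 2,msg[10],p,j,k;
  is_data[0] = data1; is_data[1] = data5;
  msg[0] = nis;                                    /* mesg[0]: no of index sets         */
  for (j=0; j<nis; j++) {
    msg[2*j+1] = is_no[j];                         /* which IS the chunk belongs to     */
    msg[2*j+2] = is_len[j];                        /* how many indices follow for it    */
  }
  p = 1 + 2*nis;                                   /* data starts after the header      */
  for (j=0; j<nis; j++) {
    for (k=0; k<is_len[j]; k++) msg[p++] = is_data[j][k];
  }
  /* msg == {2, 1,3, 5,2, 10,11,12, 40,41} */
}
#endif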
149: static int MatIncreaseOverlap_MPISBAIJ_Once(Mat C,int imax,IS *is)
150: {
151: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
152: int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
153: int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
154: int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
155: int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
156: PetscBT *table;
157: MPI_Comm comm;
158: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
159: MPI_Status *s_status,*recv_status;
162: comm = C->comm;
163: size = c->size;
164: rank = c->rank;
165: Mbs = c->Mbs;
167: PetscObjectGetNewTag((PetscObject)C,&tag1);
168: PetscObjectGetNewTag((PetscObject)C,&tag2);
170: len = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
171: ierr = PetscMalloc(len,&idx);
172: n = (int*)(idx + imax);
173: rtable = n + imax;
174:
175: for (i=0; i<imax; i++) {
176: ISGetIndices(is[i],&idx[i]);
177: ISGetSize(is[i],&n[i]);
178: }
179:
180: /* Create hash table for the mapping :row -> proc*/
181: for (i=0,j=0; i<size; i++) {
182: len = c->rowners[i+1];
183: for (; j<len; j++) {
184: rtable[j] = i;
185: }
186: }
188: /* evaluate communication - mesg to whom, length of mesg, and buffer space
189: required. Based on this, buffers are allocated, and data copied into them*/
190: PetscMalloc(size*4*sizeof(int),&w1);/* mesg size */
191: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
192: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
193: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
194: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
195: for (i=0; i<imax; i++) {
196: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
197: idx_i = idx[i];
198: len = n[i];
199: for (j=0; j<len; j++) {
200: row = idx_i[j];
201: if (row < 0) {
202: SETERRQ(1,"Index set cannot have negative entries");
203: }
204: proc = rtable[row];
205: w4[proc]++;
206: }
207: for (j=0; j<size; j++){
208: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
209: }
210: }
212: nrqs = 0; /* no of outgoing messages */
213: msz = 0; /* total mesg length (for all procs) */
214: w1[rank] = 0; /* no mesg sent to itself */
215: w3[rank] = 0;
216: for (i=0; i<size; i++) {
217: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
218: }
219: /* pa - is list of processors to communicate with */
220: PetscMalloc((nrqs+1)*sizeof(int),&pa);
221: for (i=0,j=0; i<size; i++) {
222: if (w1[i]) {pa[j] = i; j++;}
223: }
225: /* Each message would have a header = 1 + 2*(no of IS) + data */
226: for (i=0; i<nrqs; i++) {
227: j = pa[i];
228: w1[j] += w2[j] + 2*w3[j];
229: msz += w1[j];
230: }
231:
232:
233: /* Determine the number of messages to expect, their lengths, and their from-ids */
234: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
235: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
237: /* Now post the Irecvs corresponding to these messages */
238: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
239:
240: /* Allocate Memory for outgoing messages */
241: len = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
242: PetscMalloc(len,&outdat);
243: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
244: PetscMemzero(outdat,2*size*sizeof(int*));
245: tmp = (int*)(outdat + 2*size);
246: ctr = tmp + msz;
248: {
249: int *iptr = tmp,ict = 0;
250: for (i=0; i<nrqs; i++) {
251: j = pa[i];
252: iptr += ict;
253: outdat[j] = iptr;
254: ict = w1[j];
255: }
256: }
258: /* Form the outgoing messages */
259: /*plug in the headers*/
260: for (i=0; i<nrqs; i++) {
261: j = pa[i];
262: outdat[j][0] = 0;
263: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
264: ptr[j] = outdat[j] + 2*w3[j] + 1;
265: }
266:
267: /* Memory for doing local proc's work*/
268: {
269: int *d_p;
270: char *t_p;
272: len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
273: (Mbs)*imax*sizeof(int) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
274: PetscMalloc(len,&table);
275: PetscMemzero(table,len);
276: data = (int **)(table + imax);
277: isz = (int *)(data + imax);
278: d_p = (int *)(isz + imax);
279: t_p = (char *)(d_p + Mbs*imax);
280: for (i=0; i<imax; i++) {
281: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
282: data[i] = d_p + (Mbs)*i;
283: }
284: }
286: /* Parse the IS and update local tables and the outgoing buf with the data*/
287: {
288: int n_i,*data_i,isz_i,*outdat_j,ctr_j;
289: PetscBT table_i;
291: for (i=0; i<imax; i++) {
292: ierr = PetscMemzero(ctr,size*sizeof(int));
293: n_i = n[i];
294: table_i = table[i];
295: idx_i = idx[i];
296: data_i = data[i];
297: isz_i = isz[i];
298: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
299: row = idx_i[j];
300: proc = rtable[row];
301: if (proc != rank) { /* copy to the outgoing buffer */
302: ctr[proc]++;
303: *ptr[proc] = row;
304: ptr[proc]++;
305: }
306: else { /* Update the local table */
307: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
308: }
309: }
310: /* Update the headers for the current IS */
311: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
312: if ((ctr_j = ctr[j])) {
313: outdat_j = outdat[j];
314: k = ++outdat_j[0];
315: outdat_j[2*k] = ctr_j;
316: outdat_j[2*k-1] = i;
317: }
318: }
319: isz[i] = isz_i;
320: }
321: }
322:
323: /* Now post the sends */
324: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
325: for (i=0; i<nrqs; ++i) {
326: j = pa[i];
327: MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
328: }
329:
330: /* No longer need the original indices*/
331: for (i=0; i<imax; ++i) {
332: ISRestoreIndices(is[i],idx+i);
333: }
334: PetscFree(idx);
336: for (i=0; i<imax; ++i) {
337: ISDestroy(is[i]);
338: }
339:
340: /* Do Local work*/
341: MatIncreaseOverlap_MPISBAIJ_Local(C,imax,table,isz,data);
343: /* Receive messages*/
344: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
345: MPI_Waitall(nrqr,r_waits1,recv_status);
347: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
348: MPI_Waitall(nrqs,s_waits1,s_status);
350: /* Phase 1 sends are complete - deallocate buffers */
351: PetscFree(outdat);
352: PetscFree(w1);
354: PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
355: PetscMalloc((nrqr+1)*sizeof(int),&isz1);
356: MatIncreaseOverlap_MPISBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
357: PetscFree(rbuf);
359: /* Send the data back*/
360: /* Do a global reduction to know the buffer space req for incoming messages*/
361: {
362: int *rw1;
363:
364: PetscMalloc(size*sizeof(int),&rw1);
365: PetscMemzero(rw1,size*sizeof(int));
367: for (i=0; i<nrqr; ++i) {
368: proc = recv_status[i].MPI_SOURCE;
369: if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
370: rw1[proc] = isz1[i];
371: }
373: PetscFree(onodes1);
374: PetscFree(olengths1);
376: /* Determine the number of messages to expect, their lengths, and their from-ids */
377: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
378: PetscFree(rw1);
379: }
380: /* Now post the Irecvs corresponding to these messages */
381: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
382:
383: /* Now post the sends */
384: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
385: for (i=0; i<nrqr; ++i) {
386: j = recv_status[i].MPI_SOURCE;
387: MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
388: }
390: /* receive work done on other processors*/
391: {
392: int index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
393: PetscBT table_i;
394: MPI_Status *status2;
395:
396: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&status2);
398: for (i=0; i<nrqs; ++i) {
399: MPI_Waitany(nrqs,r_waits2,&index,status2+i);
400: /* Process the message*/
401: rbuf2_i = rbuf2[index];
402: ct1 = 2*rbuf2_i[0]+1;
403: jmax = rbuf2[index][0];
404: for (j=1; j<=jmax; j++) {
405: max = rbuf2_i[2*j];
406: is_no = rbuf2_i[2*j-1];
407: isz_i = isz[is_no];
408: data_i = data[is_no];
409: table_i = table[is_no];
410: for (k=0; k<max; k++,ct1++) {
411: row = rbuf2_i[ct1];
412: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
413: }
414: isz[is_no] = isz_i;
415: }
416: }
417: MPI_Waitall(nrqr,s_waits2,status2);
418: PetscFree(status2);
419: }
420:
421: for (i=0; i<imax; ++i) {
422: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
423: }
425:
426: PetscFree(onodes2);
427: PetscFree(olengths2);
428:
429: PetscFree(pa);
430: PetscFree(rbuf2);
431: PetscFree(s_waits1);
432: PetscFree(r_waits1);
433: PetscFree(s_waits2);
434: PetscFree(r_waits2);
435: PetscFree(table);
436: PetscFree(s_status);
437: PetscFree(recv_status);
438: PetscFree(xdata[0]);
439: PetscFree(xdata);
440: PetscFree(isz1);
441: return(0);
442: }
444: /*
445: MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
446: the work on the local processor.
448: Inputs:
449: C - MAT_MPIBAIJ;
450: imax - total no of index sets processed at a time;
451: table - an array of char - size = Mbs bits.
452:
453: Output:
454: isz - array containing the count of the solution elements corresponding
455: to each index set;
456: data - pointer to the solutions
457: */
458: static int MatIncreaseOverlap_MPISBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
459: {
460: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
461: Mat A = c->A,B = c->B;
462: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
463: int start,end,val,max,rstart,cstart,*ai,*aj;
464: int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
465: PetscBT table_i;
468: rstart = c->rstart;
469: cstart = c->cstart;
470: ai = a->i;
471: aj = a->j;
472: bi = b->i;
473: bj = b->j;
474: garray = c->garray;
476:
477: for (i=0; i<imax; i++) {
478: data_i = data[i];
479: table_i = table[i];
480: isz_i = isz[i];
481: for (j=0,max=isz[i]; j<max; j++) {
482: row = data_i[j] - rstart;
483: start = ai[row];
484: end = ai[row+1];
485: for (k=start; k<end; k++) { /* Amat */
486: val = aj[k] + cstart;
487: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
488: }
489: start = bi[row];
490: end = bi[row+1];
491: for (k=start; k<end; k++) { /* Bmat */
492: val = garray[bj[k]];
493: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
494: }
495: }
496: isz[i] = isz_i;
497: }
498: return(0);
499: }
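/*
   Editorial aside (not part of sbaijov.c): the core of the local overlap step
   above is a CSR (block AIJ) row walk — for every block row already in the
   set, append the block columns of that row.  A tiny stand-alone sketch with
   a hypothetical 3x3 block structure (single process, so no Bmat/garray part),
   compiled out with #if 0.
*/
#if 0
{
  /* block rows: 0 -> {0,1}, 1 -> {1}, 2 -> {0,2} */
  int ai[4] = {0,2,3,5},aj[5] = {0,1,1,0,2};
  int row = 2,k,nadd = 0,added[3];
  for (k=ai[row]; k<ai[row+1]; k++) added[nadd++] = aj[k];  /* columns of block row 2 */
  /* added[] == {0,2}: block row 2 pulls blocks 0 and 2 into the overlap set */
}
#endif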
500: /*
501: MatIncreaseOverlap_MPISBAIJ_Receive - Process the received messages,
502: and return the output
504: Input:
505: C - the matrix
506: nrqr - no of messages being processed.
507: rbuf - an array of pointers to the received requests
508:
509: Output:
510: xdata - array of messages to be sent back
511: isz1 - size of each message
513: For better efficiency perhaps we should malloc each xdata[i] separately;
514: then, if a realloc is required, we need only copy the data for that one row
515: rather than for all previous rows, as is done now where a single large chunk of
516: memory is used.
518: */
519: static int MatIncreaseOverlap_MPISBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
520: {
521: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
522: Mat A = c->A,B = c->B;
523: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
524: int rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
525: int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
526: int val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
527: int *rbuf_i,kmax,rbuf_0,ierr;
528: PetscBT xtable;
531: rank = c->rank;
532: Mbs = c->Mbs;
533: rstart = c->rstart;
534: cstart = c->cstart;
535: ai = a->i;
536: aj = a->j;
537: bi = b->i;
538: bj = b->j;
539: garray = c->garray;
540:
541:
542: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
543: rbuf_i = rbuf[i];
544: rbuf_0 = rbuf_i[0];
545: ct += rbuf_0;
546: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
547: }
548:
549: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
550: else max1 = 1;
551: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
552: PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
553: ++no_malloc;
554: PetscBTCreate(Mbs,xtable);
555: PetscMemzero(isz1,nrqr*sizeof(int));
556:
557: ct3 = 0;
558: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
559: rbuf_i = rbuf[i];
560: rbuf_0 = rbuf_i[0];
561: ct1 = 2*rbuf_0+1;
562: ct2 = ct1;
563: ct3 += ct1;
564: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
565: PetscBTMemzero(Mbs,xtable);
566: oct2 = ct2;
567: kmax = rbuf_i[2*j];
568: for (k=0; k<kmax; k++,ct1++) {
569: row = rbuf_i[ct1];
570: if (!PetscBTLookupSet(xtable,row)) {
571: if (!(ct3 < mem_estimate)) {
572: new_estimate = (int)(1.5*mem_estimate)+1;
573: PetscMalloc(new_estimate * sizeof(int),&tmp);
574: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
575: PetscFree(xdata[0]);
576: xdata[0] = tmp;
577: mem_estimate = new_estimate; ++no_malloc;
578: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
579: }
580: xdata[i][ct2++] = row;
581: ct3++;
582: }
583: }
584: for (k=oct2,max2=ct2; k<max2; k++) {
585: row = xdata[i][k] - rstart;
586: start = ai[row];
587: end = ai[row+1];
588: for (l=start; l<end; l++) {
589: val = aj[l] + cstart;
590: if (!PetscBTLookupSet(xtable,val)) {
591: if (!(ct3 < mem_estimate)) {
592: new_estimate = (int)(1.5*mem_estimate)+1;
593: ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);
594: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
595: ierr = PetscFree(xdata[0]);
596: xdata[0] = tmp;
597: mem_estimate = new_estimate; ++no_malloc;
598: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
599: }
600: xdata[i][ct2++] = val;
601: ct3++;
602: }
603: }
604: start = bi[row];
605: end = bi[row+1];
606: for (l=start; l<end; l++) {
607: val = garray[bj[l]];
608: if (!PetscBTLookupSet(xtable,val)) {
609: if (!(ct3 < mem_estimate)) {
610: new_estimate = (int)(1.5*mem_estimate)+1;
611: ierr = PetscMalloc(new_estimate * sizeof(int),&tmp);
612: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
613: ierr = PetscFree(xdata[0]);
614: xdata[0] = tmp;
615: mem_estimate = new_estimate; ++no_malloc;
616: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
617: }
618: xdata[i][ct2++] = val;
619: ct3++;
620: }
621: }
622: }
623: /* Update the header*/
624: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
625: xdata[i][2*j-1] = rbuf_i[2*j-1];
626: }
627: xdata[i][0] = rbuf_0;
628: xdata[i+1] = xdata[i] + ct2;
629: isz1[i] = ct2; /* size of each message */
630: }
631: PetscBTDestroy(xtable);
632: PetscLogInfo(0,"MatIncreaseOverlap_MPISBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
633: return(0);
634: }
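/*
   Editorial aside (not part of sbaijov.c): the reallocation pattern used above
   when the running count ct3 reaches mem_estimate — grow the single buffer by
   roughly 1.5x, copy, and re-base the per-message pointers xdata[1..i] into
   the new buffer.  A stand-alone sketch with hypothetical sizes; error
   checking is omitted.  Compiled out with #if 0.
*/
#if 0
{
  int *tmp,*xdata[4],isz1[4] = {5,5,5,0},i = 3,ctr;
  int mem_estimate = 16,new_estimate,ierr;
  ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
  /* ... suppose the buffer fills up while assembling message i ... */
  new_estimate = (int)(1.5*mem_estimate) + 1;
  ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
  ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
  ierr = PetscFree(xdata[0]);
  xdata[0]     = tmp;
  mem_estimate = new_estimate;
  for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; /* re-base */
}
#endif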
636: static int MatGetSubMatrices_MPISBAIJ_local(Mat,int,IS *,IS *,MatReuse,Mat *);
638: int MatGetSubMatrices_MPISBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
639: {
640: IS *isrow_new,*iscol_new;
641: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
642: int nmax,nstages_local,nstages,i,pos,max_no,ierr;
645: /* The compression and expansion should be avoided. It does not point out
646: errors and might change the indices, hence it is buggy. */
648: ierr = PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
649: iscol_new = isrow_new + ismax;
650: ierr = MatCompressIndicesSorted_MPISBAIJ(C,ismax,isrow,isrow_new);
651: ierr = MatCompressIndicesSorted_MPISBAIJ(C,ismax,iscol,iscol_new);
653: /* Allocate memory to hold all the submatrices */
654: if (scall != MAT_REUSE_MATRIX) {
655: PetscMalloc((ismax+1)*sizeof(Mat),submat);
656: }
657: /* Determine the number of stages through which submatrices are done */
658: nmax = 20*1000000 / (c->Nbs * sizeof(int));
659: if (!nmax) nmax = 1;
660: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
661:
662: /* Make sure every processor loops through the nstages */
663: MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
664:
665: for (i=0,pos=0; i<nstages; i++) {
666: if (pos+nmax <= ismax) max_no = nmax;
667: else if (pos == ismax) max_no = 0;
668: else max_no = ismax-pos;
669: MatGetSubMatrices_MPISBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
670: pos += max_no;
671: }
672:
673: for (i=0; i<ismax; i++) {
674: ISDestroy(isrow_new[i]);
675: ISDestroy(iscol_new[i]);
676: }
677: PetscFree(isrow_new);
678: return(0);
679: }
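/*
   Editorial aside (not part of sbaijov.c): a minimal usage sketch of the
   public MatGetSubMatrices() interface that ends up in the routine above for
   this matrix type.  The matrix A and the two sorted index sets are
   illustrative assumptions; error checking is omitted.  Compiled out with #if 0.
*/
#if 0
{
  Mat A,*submat;              /* assumed: an assembled parallel matrix          */
  IS  isrow,iscol;            /* assumed: sorted index sets of point indices    */
  MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
  /* ... work with the sequential matrix submat[0] ... */
  MatDestroy(submat[0]);
  PetscFree(submat);
}
#endif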
681: /* -------------------------------------------------------------------------*/
682: static int MatGetSubMatrices_MPISBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
683: {
684: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
685: Mat A = c->A;
686: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
687: int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
688: int **sbuf1,**sbuf2,rank,Mbs,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
689: int nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
690: int **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
691: int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
692: int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
693: int *rmap_i,bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
694: int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
695: int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
696: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
697: MPI_Request *r_waits4,*s_waits3,*s_waits4;
698: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
699: MPI_Status *r_status3,*r_status4,*s_status4;
700: MPI_Comm comm;
701: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
702: MatScalar *a_a=a->a,*b_a=b->a;
703: PetscTruth flag;
706: comm = C->comm;
707: tag0 = C->tag;
708: size = c->size;
709: rank = c->rank;
710: Mbs = c->Mbs;
712: /* Get some new tags to keep the communication clean */
713: PetscObjectGetNewTag((PetscObject)C,&tag1);
714: PetscObjectGetNewTag((PetscObject)C,&tag2);
715: PetscObjectGetNewTag((PetscObject)C,&tag3);
717: /* Check if the col indices are sorted */
718: for (i=0; i<ismax; i++) {
719: ISSorted(iscol[i],(PetscTruth*)&j);
720: if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
721: }
723: len = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (Mbs+1)*sizeof(int);
724: ierr = PetscMalloc(len,&irow);
725: icol = irow + ismax;
726: nrow = (int*)(icol + ismax);
727: ncol = nrow + ismax;
728: rtable = ncol + ismax;
730: for (i=0; i<ismax; i++) {
731: ISGetIndices(isrow[i],&irow[i]);
732: ISGetIndices(iscol[i],&icol[i]);
733: ISGetSize(isrow[i],&nrow[i]);
734: ISGetSize(iscol[i],&ncol[i]);
735: }
737: /* Create hash table for the mapping :row -> proc*/
738: for (i=0,j=0; i<size; i++) {
739: jmax = c->rowners[i+1];
740: for (; j<jmax; j++) {
741: rtable[j] = i;
742: }
743: }
745: /* evaluate communication - mesg to whom, length of mesg, and buffer space
746: required. Based on this, buffers are allocated, and data copied into them*/
747: PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
748: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
749: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
750: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
751: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
752: for (i=0; i<ismax; i++) {
753: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
754: jmax = nrow[i];
755: irow_i = irow[i];
756: for (j=0; j<jmax; j++) {
757: row = irow_i[j];
758: proc = rtable[row];
759: w4[proc]++;
760: }
761: for (j=0; j<size; j++) {
762: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
763: }
764: }
765:
766: nrqs = 0; /* no of outgoing messages */
767: msz = 0; /* total mesg length for all procs */
768: w1[rank] = 0; /* no mesg sent to itself */
769: w3[rank] = 0;
770: for (i=0; i<size; i++) {
771: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
772: }
773: PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
774: for (i=0,j=0; i<size; i++) {
775: if (w1[i]) { pa[j] = i; j++; }
776: }
778: /* Each message would have a header = 1 + 2*(no of IS) + data */
779: for (i=0; i<nrqs; i++) {
780: j = pa[i];
781: w1[j] += w2[j] + 2* w3[j];
782: msz += w1[j];
783: }
784: /* Do a global reduction to determine how many messages to expect*/
785: {
786: int *rw1;
787: PetscMalloc(2*size*sizeof(int),&rw1);
788: MPI_Allreduce(w1,rw1,2*size,MPI_INT,PetscMaxSum_Op,comm);
789: bsz = rw1[rank];
790: nrqr = rw1[size+rank];
791: PetscFree(rw1);
792: }
794: /* Allocate memory for recv buffers. Probably none is needed if nrqr = 0 ???? */
795: len = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
796: ierr = PetscMalloc(len,&rbuf1);
797: rbuf1[0] = (int*)(rbuf1 + nrqr);
798: for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;
799:
800: /* Post the receives */
801: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&r_waits1);
802: for (i=0; i<nrqr; ++i) {
803: MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);
804: }
806: /* Allocate Memory for outgoing messages */
807: len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
808: PetscMalloc(len,&sbuf1);
809: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
810: PetscMemzero(sbuf1,2*size*sizeof(int*));
811: /* allocate memory for outgoing data + buf to receive the first reply */
812: tmp = (int*)(ptr + size);
813: ctr = tmp + 2*msz;
815: {
816: int *iptr = tmp,ict = 0;
817: for (i=0; i<nrqs; i++) {
818: j = pa[i];
819: iptr += ict;
820: sbuf1[j] = iptr;
821: ict = w1[j];
822: }
823: }
825: /* Form the outgoing messages */
826: /* Initialise the header space */
827: for (i=0; i<nrqs; i++) {
828: j = pa[i];
829: sbuf1[j][0] = 0;
830: ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
831: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
832: }
833:
834: /* Parse the isrow and copy data into outbuf */
835: for (i=0; i<ismax; i++) {
836: ierr = PetscMemzero(ctr,size*sizeof(int));
837: irow_i = irow[i];
838: jmax = nrow[i];
839: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
840: row = irow_i[j];
841: proc = rtable[row];
842: if (proc != rank) { /* copy to the outgoing buf*/
843: ctr[proc]++;
844: *ptr[proc] = row;
845: ptr[proc]++;
846: }
847: }
848: /* Update the headers for the current IS */
849: for (j=0; j<size; j++) { /* Can Optimise this loop too */
850: if ((ctr_j = ctr[j])) {
851: sbuf1_j = sbuf1[j];
852: k = ++sbuf1_j[0];
853: sbuf1_j[2*k] = ctr_j;
854: sbuf1_j[2*k-1] = i;
855: }
856: }
857: }
859: /* Now post the sends */
860: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
861: for (i=0; i<nrqs; ++i) {
862: j = pa[i];
863: MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
864: }
866: /* Post Receives to capture the buffer size */
867: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
868: PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
869: rbuf2[0] = tmp + msz;
870: for (i=1; i<nrqs; ++i) {
871: j = pa[i];
872: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
873: }
874: for (i=0; i<nrqs; ++i) {
875: j = pa[i];
876: MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
877: }
879: /* Send to other procs the buf size they should allocate */
880:
882: /* Receive messages*/
883: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
884: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
885: len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
886: ierr = PetscMalloc(len,&sbuf2);
887: req_size = (int*)(sbuf2 + nrqr);
888: req_source = req_size + nrqr;
889:
890: {
891: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
892: int *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
894: for (i=0; i<nrqr; ++i) {
895: ierr = MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);
896: req_size[index] = 0;
897: rbuf1_i = rbuf1[index];
898: start = 2*rbuf1_i[0] + 1;
899: ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);
900: ierr = PetscMalloc(end*sizeof(int),&sbuf2[index]);
901: sbuf2_i = sbuf2[index];
902: for (j=start; j<end; j++) {
903: id = rbuf1_i[j] - rstart;
904: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
905: sbuf2_i[j] = ncols;
906: req_size[index] += ncols;
907: }
908: req_source[index] = r_status1[i].MPI_SOURCE;
909: /* form the header */
910: sbuf2_i[0] = req_size[index];
911: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
912: MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);
913: }
914: }
915: PetscFree(r_status1);
916: PetscFree(r_waits1);
918: /* recv buffer sizes */
919: /* Receive messages*/
920:
921: PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
922: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
923: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
924: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
925: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
927: for (i=0; i<nrqs; ++i) {
928: MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);
929: PetscMalloc(rbuf2[index][0]*sizeof(int),&rbuf3[index]);
930: PetscMalloc(rbuf2[index][0]*bs2*sizeof(MatScalar),&rbuf4[index]);
931: MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,
932: r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);
933: MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2,MPIU_MATSCALAR,
934: r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);
935: }
936: PetscFree(r_status2);
937: PetscFree(r_waits2);
938:
939: /* Wait on sends1 and sends2 */
940: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
941: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
943: MPI_Waitall(nrqs,s_waits1,s_status1);
944: MPI_Waitall(nrqr,s_waits2,s_status2);
945: PetscFree(s_status1);
946: PetscFree(s_status2);
947: PetscFree(s_waits1);
948: PetscFree(s_waits2);
950: /* Now allocate buffers for a->j, and send them off */
951: PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);
952: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
953: PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
954: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
955:
956: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
957: {
958: for (i=0; i<nrqr; i++) {
959: rbuf1_i = rbuf1[i];
960: sbuf_aj_i = sbuf_aj[i];
961: ct1 = 2*rbuf1_i[0] + 1;
962: ct2 = 0;
963: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
964: kmax = rbuf1[i][2*j];
965: for (k=0; k<kmax; k++,ct1++) {
966: row = rbuf1_i[ct1] - rstart;
967: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
968: ncols = nzA + nzB;
969: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
971: /* load the column indices for this row into cols*/
972: cols = sbuf_aj_i + ct2;
973: for (l=0; l<nzB; l++) {
974: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
975: else break;
976: }
977: imark = l;
978: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
979: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
980: ct2 += ncols;
981: }
982: }
983: MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
984: }
985: }
986: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
987: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
989: /* Allocate buffers for a->a, and send them off */
990: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
991: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
992: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
993: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
994:
995: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
996: {
997: for (i=0; i<nrqr; i++) {
998: rbuf1_i = rbuf1[i];
999: sbuf_aa_i = sbuf_aa[i];
1000: ct1 = 2*rbuf1_i[0]+1;
1001: ct2 = 0;
1002: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1003: kmax = rbuf1_i[2*j];
1004: for (k=0; k<kmax; k++,ct1++) {
1005: row = rbuf1_i[ct1] - rstart;
1006: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1007: ncols = nzA + nzB;
1008: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1009: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
1011: /* load the column values for this row into vals*/
1012: vals = sbuf_aa_i+ct2*bs2;
1013: for (l=0; l<nzB; l++) {
1014: if ((bmap[cworkB[l]]) < cstart) {
1015: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1016: }
1017: else break;
1018: }
1019: imark = l;
1020: for (l=0; l<nzA; l++) {
1021: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
1022: }
1023: for (l=imark; l<nzB; l++) {
1024: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1025: }
1026: ct2 += ncols;
1027: }
1028: }
1029: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
1030: }
1031: }
1032: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1033: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1034: PetscFree(rbuf1);
1036: /* Form the matrix */
1037: /* create col map */
1038: {
1039: int *icol_i;
1040:
1041: len = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
1042: ierr = PetscMalloc(len,&cmap);
1043: cmap[0] = (int *)(cmap + ismax);
1044: ierr = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
1045: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1046: for (i=0; i<ismax; i++) {
1047: jmax = ncol[i];
1048: icol_i = icol[i];
1049: cmap_i = cmap[i];
1050: for (j=0; j<jmax; j++) {
1051: cmap_i[icol_i[j]] = j+1;
1052: }
1053: }
1054: }
1055:
1057: /* Create lens which is required for MatCreate... */
1058: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1059: len = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1060: ierr = PetscMalloc(len,&lens);
1061: lens[0] = (int *)(lens + ismax);
1062: ierr = PetscMemzero(lens[0],j*sizeof(int));
1063: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1064:
1065: /* Update lens from local data */
1066: for (i=0; i<ismax; i++) {
1067: jmax = nrow[i];
1068: cmap_i = cmap[i];
1069: irow_i = irow[i];
1070: lens_i = lens[i];
1071: for (j=0; j<jmax; j++) {
1072: row = irow_i[j];
1073: proc = rtable[row];
1074: if (proc == rank) {
1075: /* Get indices from matA and then from matB */
1076: row = row - rstart;
1077: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1078: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1079: for (k=0; k<nzA; k++) {
1080: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++;}
1081: }
1082: for (k=0; k<nzB; k++) {
1083: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++;}
1084: }
1085: }
1086: }
1087: }
1088:
1089: /* Create row map*/
1090: len = (1+ismax)*sizeof(int*)+ ismax*c->Mbs*sizeof(int);
1091: ierr = PetscMalloc(len,&rmap);
1092: rmap[0] = (int *)(rmap + ismax);
1093: ierr = PetscMemzero(rmap[0],ismax*c->Mbs*sizeof(int));
1094: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->Mbs;}
1095: for (i=0; i<ismax; i++) {
1096: rmap_i = rmap[i];
1097: irow_i = irow[i];
1098: jmax = nrow[i];
1099: for (j=0; j<jmax; j++) {
1100: rmap_i[irow_i[j]] = j;
1101: }
1102: }
1103:
1104: /* Update lens from offproc data */
1105: {
1106: int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1108: for (tmp2=0; tmp2<nrqs; tmp2++) {
1109: ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1110: index = pa[i];
1111: sbuf1_i = sbuf1[index];
1112: jmax = sbuf1_i[0];
1113: ct1 = 2*jmax+1;
1114: ct2 = 0;
1115: rbuf2_i = rbuf2[i];
1116: rbuf3_i = rbuf3[i];
1117: for (j=1; j<=jmax; j++) {
1118: is_no = sbuf1_i[2*j-1];
1119: max1 = sbuf1_i[2*j];
1120: lens_i = lens[is_no];
1121: cmap_i = cmap[is_no];
1122: rmap_i = rmap[is_no];
1123: for (k=0; k<max1; k++,ct1++) {
1124: row = rmap_i[sbuf1_i[ct1]]; /* the row in the new matrix to be */
1125: max2 = rbuf2_i[ct1];
1126: for (l=0; l<max2; l++,ct2++) {
1127: if (cmap_i[rbuf3_i[ct2]]) {
1128: lens_i[row]++;
1129: }
1130: }
1131: }
1132: }
1133: }
1134: }
1135: PetscFree(r_status3);
1136: PetscFree(r_waits3);
1137: MPI_Waitall(nrqr,s_waits3,s_status3);
1138: PetscFree(s_status3);
1139: PetscFree(s_waits3);
1141: /* Create the submatrices */
1142: if (scall == MAT_REUSE_MATRIX) {
1143: /*
1144: Assumes new rows are same length as the old rows, hence bug!
1145: */
1146: for (i=0; i<ismax; i++) {
1147: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1148: if (mat->mbs != nrow[i] || mat->nbs != ncol[i] || mat->bs != bs) {
1149: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1150: }
1151: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);
1152: if (flag == PETSC_FALSE) {
1153: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1154: }
1155: /* Initialize matrix as if empty */
1156: PetscMemzero(mat->ilen,mat->mbs*sizeof(int));
1157: submats[i]->factor = C->factor;
1158: }
1159: } else {
1160: for (i=0; i<ismax; i++) {
1161: MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);
1162: }
1163: }
1165: /* Assemble the matrices */
1166: /* First assemble the local rows */
1167: {
1168: int ilen_row,*imat_ilen,*imat_j,*imat_i;
1169: MatScalar *imat_a;
1170:
1171: for (i=0; i<ismax; i++) {
1172: mat = (Mat_SeqBAIJ*)submats[i]->data;
1173: imat_ilen = mat->ilen;
1174: imat_j = mat->j;
1175: imat_i = mat->i;
1176: imat_a = mat->a;
1177: cmap_i = cmap[i];
1178: rmap_i = rmap[i];
1179: irow_i = irow[i];
1180: jmax = nrow[i];
1181: for (j=0; j<jmax; j++) {
1182: row = irow_i[j];
1183: proc = rtable[row];
1184: if (proc == rank) {
1185: row = row - rstart;
1186: nzA = a_i[row+1] - a_i[row];
1187: nzB = b_i[row+1] - b_i[row];
1188: cworkA = a_j + a_i[row];
1189: cworkB = b_j + b_i[row];
1190: vworkA = a_a + a_i[row]*bs2;
1191: vworkB = b_a + b_i[row]*bs2;
1193: row = rmap_i[row + rstart];
1194: mat_i = imat_i[row];
1195: mat_a = imat_a + mat_i*bs2;
1196: mat_j = imat_j + mat_i;
1197: ilen_row = imat_ilen[row];
1199: /* load the column indices for this row into cols*/
1200: for (l=0; l<nzB; l++) {
1201: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1202: if ((tcol = cmap_i[ctmp])) {
1203: *mat_j++ = tcol - 1;
1204: ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1205: mat_a += bs2;
1206: ilen_row++;
1207: }
1208: } else break;
1209: }
1210: imark = l;
1211: for (l=0; l<nzA; l++) {
1212: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1213: *mat_j++ = tcol - 1;
1214: ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1215: mat_a += bs2;
1216: ilen_row++;
1217: }
1218: }
1219: for (l=imark; l<nzB; l++) {
1220: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1221: *mat_j++ = tcol - 1;
1222: ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1223: mat_a += bs2;
1224: ilen_row++;
1225: }
1226: }
1227: imat_ilen[row] = ilen_row;
1228: }
1229: }
1230:
1231: }
1232: }
1234: /* Now assemble the off proc rows*/
1235: {
1236: int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1237: int *imat_j,*imat_i;
1238: MatScalar *imat_a,*rbuf4_i;
1240: for (tmp2=0; tmp2<nrqs; tmp2++) {
1241: ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1242: index = pa[i];
1243: sbuf1_i = sbuf1[index];
1244: jmax = sbuf1_i[0];
1245: ct1 = 2*jmax + 1;
1246: ct2 = 0;
1247: rbuf2_i = rbuf2[i];
1248: rbuf3_i = rbuf3[i];
1249: rbuf4_i = rbuf4[i];
1250: for (j=1; j<=jmax; j++) {
1251: is_no = sbuf1_i[2*j-1];
1252: rmap_i = rmap[is_no];
1253: cmap_i = cmap[is_no];
1254: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1255: imat_ilen = mat->ilen;
1256: imat_j = mat->j;
1257: imat_i = mat->i;
1258: imat_a = mat->a;
1259: max1 = sbuf1_i[2*j];
1260: for (k=0; k<max1; k++,ct1++) {
1261: row = sbuf1_i[ct1];
1262: row = rmap_i[row];
1263: ilen = imat_ilen[row];
1264: mat_i = imat_i[row];
1265: mat_a = imat_a + mat_i*bs2;
1266: mat_j = imat_j + mat_i;
1267: max2 = rbuf2_i[ct1];
1268: for (l=0; l<max2; l++,ct2++) {
1269: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1270: *mat_j++ = tcol - 1;
1271: /* *mat_a++= rbuf4_i[ct2]; */
1272: ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1273: mat_a += bs2;
1274: ilen++;
1275: }
1276: }
1277: imat_ilen[row] = ilen;
1278: }
1279: }
1280: }
1281: }
1282: PetscFree(r_status4);
1283: PetscFree(r_waits4);
1284: MPI_Waitall(nrqr,s_waits4,s_status4);
1285: PetscFree(s_waits4);
1286: PetscFree(s_status4);
1288: /* Restore the indices */
1289: for (i=0; i<ismax; i++) {
1290: ISRestoreIndices(isrow[i],irow+i);
1291: ISRestoreIndices(iscol[i],icol+i);
1292: }
1294: /* Destroy allocated memory */
1295: PetscFree(irow);
1296: PetscFree(w1);
1297: PetscFree(pa);
1299: PetscFree(sbuf1);
1300: PetscFree(rbuf2);
1301: for (i=0; i<nrqr; ++i) {
1302: PetscFree(sbuf2[i]);
1303: }
1304: for (i=0; i<nrqs; ++i) {
1305: PetscFree(rbuf3[i]);
1306: PetscFree(rbuf4[i]);
1307: }
1309: PetscFree(sbuf2);
1310: PetscFree(rbuf3);
1311: PetscFree(rbuf4);
1312: PetscFree(sbuf_aj[0]);
1313: PetscFree(sbuf_aj);
1314: PetscFree(sbuf_aa[0]);
1315: PetscFree(sbuf_aa);
1316:
1317: PetscFree(cmap);
1318: PetscFree(rmap);
1319: PetscFree(lens);
1321: for (i=0; i<ismax; i++) {
1322: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1323: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1324: }
1326: return(0);
1327: }