Actual source code: baijov.c
1: /*$Id: baijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5:    and to find submatrices that are shared across processors.
6: */
7: #include "src/mat/impls/baij/mpi/mpibaij.h"
8: #include "petscbt.h"
10: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
11: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
12: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
13: EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
14: EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
15:
16: static int MatCompressIndicesGeneral_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
17: {
18: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
19: int ierr,isz,bs = baij->bs,n,i,j,*idx,ival;
20: #if defined (PETSC_USE_CTABLE)
21: PetscTable gid1_lid1;
22: int tt, gid1, *nidx;
23: PetscTablePosition tpos;
24: #else
25: int Nbs,*nidx;
26: PetscBT table;
27: #endif
30: #if defined (PETSC_USE_CTABLE)
31: PetscTableCreate(baij->mbs,&gid1_lid1);
32: #else
33: Nbs = baij->Nbs;
34: PetscMalloc((Nbs+1)*sizeof(int),&nidx);
35: PetscBTCreate(Nbs,table);
36: #endif
37: for (i=0; i<imax; i++) {
38: isz = 0;
39: #if defined (PETSC_USE_CTABLE)
40: PetscTableRemoveAll(gid1_lid1);
41: #else
42: PetscBTMemzero(Nbs,table);
43: #endif
44: ISGetIndices(is_in[i],&idx);
45: ISGetLocalSize(is_in[i],&n);
46: for (j=0; j<n ; j++) {
47: ival = idx[j]/bs; /* convert the indices into block indices */
48: #if defined (PETSC_USE_CTABLE)
49: PetscTableFind(gid1_lid1,ival+1,&tt);
50: if (!tt) {
51: PetscTableAdd(gid1_lid1,ival+1,isz+1);
52: isz++;
53: }
54: #else
55: if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
56: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
57: #endif
58: }
59: ISRestoreIndices(is_in[i],&idx);
60: #if defined (PETSC_USE_CTABLE)
61: PetscMalloc((isz+1)*sizeof(int),&nidx);
62: PetscTableGetHeadPosition(gid1_lid1,&tpos);
63: j = 0;
64: while (tpos) {
65: PetscTableGetNext(gid1_lid1,&tpos,&gid1,&tt);
66: if (tt-- > isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than array-dim"); }
67: nidx[tt] = gid1 - 1;
68: j++;
69: }
70: if (j != isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"table error: jj != isz"); }
71: ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));
72: PetscFree(nidx);
73: #else
74: ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));
75: #endif
76: }
77: #if defined (PETSC_USE_CTABLE)
78: PetscTableDelete(gid1_lid1);
79: #else
80: PetscBTDestroy(table);
81: PetscFree(nidx);
82: #endif
83: return(0);
84: }
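/*
   A worked example of the compression above (hypothetical values, not taken
   from any particular run): with block size bs = 2, the point indices
   {4,5,9,8,5} map to the block indices 4/2=2, 5/2=2, 9/2=4, 8/2=4, 5/2=2,
   so after the duplicates are removed the compressed IS contains {2,4}.
   Partial blocks and unsorted input are accepted here, unlike in the
   sorted variant below.
*/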
86: static int MatCompressIndicesSorted_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
87: {
88: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
89: int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,*idx_local;
90: PetscTruth flg;
91: #if defined (PETSC_USE_CTABLE)
92: int maxsz;
93: #else
94: int Nbs=baij->Nbs;
95: #endif
97: for (i=0; i<imax; i++) {
98: ISSorted(is_in[i],&flg);
99: if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Indices are not sorted");
100: }
101: #if defined (PETSC_USE_CTABLE)
102: /* Now check max size */
103: for (i=0,maxsz=0; i<imax; i++) {
104: ISGetIndices(is_in[i],&idx);
105: ISGetLocalSize(is_in[i],&n);
106: if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");
107: n = n/bs; /* The reduced index size */
108: if (n > maxsz) maxsz = n;
109: }
110: PetscMalloc((maxsz+1)*sizeof(int),&nidx);
111: #else
112: PetscMalloc((Nbs+1)*sizeof(int),&nidx);
113: #endif
114: /* Now check if the indices are in block order */
115: for (i=0; i<imax; i++) {
116: ISGetIndices(is_in[i],&idx);
117: ISGetLocalSize(is_in[i],&n);
118: if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");
120: n = n/bs; /* The reduced index size */
121: idx_local = idx;
122: for (j=0; j<n ; j++) {
123: val = idx_local[0];
124: if (val%bs != 0) SETERRQ(1,"Indices are not block ordered");
125: for (k=0; k<bs; k++) {
126: if (val+k != idx_local[k]) SETERRQ(1,"Indices are not block ordered");
127: }
128: nidx[j] = val/bs;
129: idx_local +=bs;
130: }
131: ISRestoreIndices(is_in[i],&idx);
132: ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));
133: }
134: PetscFree(nidx);
136: return(0);
137: }
139: static int MatExpandIndices_MPIBAIJ(Mat C,int imax,IS *is_in,IS *is_out)
140: {
141: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
142: int ierr,bs = baij->bs,n,i,j,k,*idx,*nidx;
143: #if defined (PETSC_USE_CTABLE)
144: int maxsz;
145: #else
146: int Nbs = baij->Nbs;
147: #endif
149:
150: #if defined (PETSC_USE_CTABLE)
151: /* Now check max size */
152: for (i=0,maxsz=0; i<imax; i++) {
153: ISGetIndices(is_in[i],&idx);
154: ISGetLocalSize(is_in[i],&n);
155: if (n*bs > maxsz) maxsz = n*bs;
156: }
157: PetscMalloc((maxsz+1)*sizeof(int),&nidx);
158: #else
159: PetscMalloc((Nbs*bs+1)*sizeof(int),&nidx);
160: #endif
162: for (i=0; i<imax; i++) {
163: ISGetIndices(is_in[i],&idx);
164: ISGetLocalSize(is_in[i],&n);
165: for (j=0; j<n ; ++j){
166: for (k=0; k<bs; k++)
167: nidx[j*bs+k] = idx[j]*bs+k;
168: }
169: ISRestoreIndices(is_in[i],&idx);
170: ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);
171: }
172: PetscFree(nidx);
173: return(0);
174: }
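/*
   A worked example of the expansion above (hypothetical values): with block
   size bs = 2, the block indices {2,4} expand to the point indices
   {4,5,8,9}; each block index b contributes the bs consecutive point
   indices b*bs,...,b*bs+bs-1.
*/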
177: int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS *is,int ov)
178: {
179: int i,ierr;
180: IS *is_new;
183: PetscMalloc(imax*sizeof(IS),&is_new);
184: /* Convert the indices into block format */
185: MatCompressIndicesGeneral_MPIBAIJ(C,imax,is,is_new);
186:   if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
187: for (i=0; i<ov; ++i) {
188: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
189: }
190: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
191: MatExpandIndices_MPIBAIJ(C,imax,is_new,is);
192: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
193: PetscFree(is_new);
194: return(0);
195: }
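/*
   A minimal usage sketch (illustrative only, not part of the original
   source): for an MPIBAIJ matrix the routine above is reached through the
   public MatIncreaseOverlap() call.  The matrix A, the index values and the
   overlap of 2 below are hypothetical; the IS holds ordinary point row
   indices, which are compressed to block indices internally and expanded
   again before being returned.
*/
#if 0
static int SampleIncreaseOverlap(Mat A)
{
  IS  is;
  int ierr;

  /* start from the first 8 point rows of the parallel matrix */
  ierr = ISCreateStride(PETSC_COMM_SELF,8,0,1,&is);CHKERRQ(ierr);
  /* enlarge the index set by two levels of overlap */
  ierr = MatIncreaseOverlap(A,1,&is,2);CHKERRQ(ierr);
  /* ... use the enlarged IS, e.g. with MatGetSubMatrices() ... */
  ierr = ISDestroy(is);CHKERRQ(ierr);
  return 0;
}
#endif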
197: /*
198: Sample message format:
199:   If a processor A wants processor B to process some elements corresponding
200:   to the index sets is[1], is[5]
201:   mesg [0] = 2 (no of index sets in the mesg)
202:   -----------
203:   mesg [1] = 1 => is[1]
204:   mesg [2] = sizeof(is[1]);
205:   -----------
206:   mesg [3] = 5 => is[5]
207:   mesg [4] = sizeof(is[5]);
208:   -----------
209:   mesg [5]
210:   mesg [n]  data of is[1]
211:   -----------
212:   mesg [n+1]
213:   mesg [m]  data of is[5]
214:   -----------
215:
216: Notes:
217: nrqs - no of requests sent (or to be sent out)
218:   nrqr - no of requests received (which have to be or which have been processed)
219: */
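/*
   A concrete instance of the layout above (hypothetical values): suppose
   is[1] holds the 3 rows {10,11,12} and is[5] holds the 2 rows {40,41}
   that live on processor B.  The message processor A sends to B is then

      mesg[] = {2,              header: two index sets follow
                1, 3,           is[1] with 3 indices
                5, 2,           is[5] with 2 indices
                10, 11, 12,     data of is[1]
                40, 41};        data of is[5]

   i.e. a header of 1 + 2*(no of IS) entries followed by the row data.
*/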
220: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS *is)
221: {
222: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
223: int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
224: int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
225: int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
226: int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
227: PetscBT *table;
228: MPI_Comm comm;
229: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
230: MPI_Status *s_status,*recv_status;
233: comm = C->comm;
234: size = c->size;
235: rank = c->rank;
236: Mbs = c->Mbs;
238: PetscObjectGetNewTag((PetscObject)C,&tag1);
239: PetscObjectGetNewTag((PetscObject)C,&tag2);
241: len = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
242: ierr = PetscMalloc(len,&idx);
243: n = (int*)(idx + imax);
244: rtable = n + imax;
245:
246: for (i=0; i<imax; i++) {
247: ISGetIndices(is[i],&idx[i]);
248: ISGetLocalSize(is[i],&n[i]);
249: }
250:
251: /* Create hash table for the mapping :row -> proc*/
252: for (i=0,j=0; i<size; i++) {
253: len = c->rowners[i+1];
254: for (; j<len; j++) {
255: rtable[j] = i;
256: }
257: }
259: /* evaluate communication - mesg to who,length of mesg, and buffer space
260: required. Based on this, buffers are allocated, and data copied into them*/
261: PetscMalloc(size*4*sizeof(int),&w1);/* mesg size */
262: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
263: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
264: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
265: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
266: for (i=0; i<imax; i++) {
267: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
268: idx_i = idx[i];
269: len = n[i];
270: for (j=0; j<len; j++) {
271: row = idx_i[j];
272: if (row < 0) {
273: SETERRQ(1,"Index set cannot have negative entries");
274: }
275: proc = rtable[row];
276: w4[proc]++;
277: }
278: for (j=0; j<size; j++){
279: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
280: }
281: }
283: nrqs = 0; /* no of outgoing messages */
284:   msz   = 0;              /* total mesg length (for all procs) */
285:   w1[rank] = 0;           /* no mesg sent to itself */
286: w3[rank] = 0;
287: for (i=0; i<size; i++) {
288: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
289: }
290: /* pa - is list of processors to communicate with */
291: PetscMalloc((nrqs+1)*sizeof(int),&pa);
292: for (i=0,j=0; i<size; i++) {
293: if (w1[i]) {pa[j] = i; j++;}
294: }
296: /* Each message would have a header = 1 + 2*(no of IS) + data */
297: for (i=0; i<nrqs; i++) {
298: j = pa[i];
299: w1[j] += w2[j] + 2*w3[j];
300: msz += w1[j];
301: }
302:
303:   /* Determine the number of messages to expect, their lengths, and their from-ids */
304: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
305: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
307: /* Now post the Irecvs corresponding to these messages */
308: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
309:
310: /* Allocate Memory for outgoing messages */
311: len = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
312: PetscMalloc(len,&outdat);
313: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
314: ierr = PetscMemzero(outdat,2*size*sizeof(int*));
315: tmp = (int*)(outdat + 2*size);
316: ctr = tmp + msz;
318: {
319: int *iptr = tmp,ict = 0;
320: for (i=0; i<nrqs; i++) {
321: j = pa[i];
322: iptr += ict;
323: outdat[j] = iptr;
324: ict = w1[j];
325: }
326: }
328: /* Form the outgoing messages */
329: /*plug in the headers*/
330: for (i=0; i<nrqs; i++) {
331: j = pa[i];
332: outdat[j][0] = 0;
333: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
334: ptr[j] = outdat[j] + 2*w3[j] + 1;
335: }
336:
337: /* Memory for doing local proc's work*/
338: {
339: int *d_p;
340: char *t_p;
342: len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
343: (Mbs)*imax*sizeof(int) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
344: PetscMalloc(len,&table);
345: PetscMemzero(table,len);
346: data = (int **)(table + imax);
347: isz = (int *)(data + imax);
348: d_p = (int *)(isz + imax);
349: t_p = (char *)(d_p + Mbs*imax);
350: for (i=0; i<imax; i++) {
351: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
352: data[i] = d_p + (Mbs)*i;
353: }
354: }
356: /* Parse the IS and update local tables and the outgoing buf with the data*/
357: {
358: int n_i,*data_i,isz_i,*outdat_j,ctr_j;
359: PetscBT table_i;
361: for (i=0; i<imax; i++) {
362: ierr = PetscMemzero(ctr,size*sizeof(int));
363: n_i = n[i];
364: table_i = table[i];
365: idx_i = idx[i];
366: data_i = data[i];
367: isz_i = isz[i];
368: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
369: row = idx_i[j];
370: proc = rtable[row];
371: if (proc != rank) { /* copy to the outgoing buffer */
372: ctr[proc]++;
373: *ptr[proc] = row;
374: ptr[proc]++;
375: }
376: else { /* Update the local table */
377: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
378: }
379: }
380: /* Update the headers for the current IS */
381: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
382: if ((ctr_j = ctr[j])) {
383: outdat_j = outdat[j];
384: k = ++outdat_j[0];
385: outdat_j[2*k] = ctr_j;
386: outdat_j[2*k-1] = i;
387: }
388: }
389: isz[i] = isz_i;
390: }
391: }
392:
393: /* Now post the sends */
394: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
395: for (i=0; i<nrqs; ++i) {
396: j = pa[i];
397: MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
398: }
399:
400: /* No longer need the original indices*/
401: for (i=0; i<imax; ++i) {
402: ISRestoreIndices(is[i],idx+i);
403: }
404: PetscFree(idx);
406: for (i=0; i<imax; ++i) {
407: ISDestroy(is[i]);
408: }
409:
410: /* Do Local work*/
411: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
413: /* Receive messages*/
414: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
415: MPI_Waitall(nrqr,r_waits1,recv_status);
416:
417: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
418: MPI_Waitall(nrqs,s_waits1,s_status);
420: /* Phase 1 sends are complete - deallocate buffers */
421: PetscFree(outdat);
422: PetscFree(w1);
424: PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
425: PetscMalloc((nrqr+1)*sizeof(int),&isz1);
426: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
427: PetscFree(rbuf);
429: /* Send the data back*/
430: /* Do a global reduction to know the buffer space req for incoming messages*/
431: {
432: int *rw1;
433:
434: PetscMalloc(size*sizeof(int),&rw1);
435: PetscMemzero(rw1,size*sizeof(int));
437: for (i=0; i<nrqr; ++i) {
438: proc = recv_status[i].MPI_SOURCE;
439: if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
440: rw1[proc] = isz1[i];
441: }
442:
443: PetscFree(onodes1);
444: PetscFree(olengths1);
446:     /* Determine the number of messages to expect, their lengths, and their from-ids */
447: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
448: PetscFree(rw1);
449: }
450: /* Now post the Irecvs corresponding to these messages */
451: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
452:
453: /* Now post the sends */
454: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
455: for (i=0; i<nrqr; ++i) {
456: j = recv_status[i].MPI_SOURCE;
457: MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
458: }
460: /* receive work done on other processors*/
461: {
462: int index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
463: PetscBT table_i;
464: MPI_Status *status2;
465:
466: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&status2);
468: for (i=0; i<nrqs; ++i) {
469: MPI_Waitany(nrqs,r_waits2,&index,status2+i);
470: /* Process the message*/
471: rbuf2_i = rbuf2[index];
472: ct1 = 2*rbuf2_i[0]+1;
473: jmax = rbuf2[index][0];
474: for (j=1; j<=jmax; j++) {
475: max = rbuf2_i[2*j];
476: is_no = rbuf2_i[2*j-1];
477: isz_i = isz[is_no];
478: data_i = data[is_no];
479: table_i = table[is_no];
480: for (k=0; k<max; k++,ct1++) {
481: row = rbuf2_i[ct1];
482: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
483: }
484: isz[is_no] = isz_i;
485: }
486: }
487: MPI_Waitall(nrqr,s_waits2,status2);
488: PetscFree(status2);
489: }
490:
491: for (i=0; i<imax; ++i) {
492: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
493: }
494:
495:
496: PetscFree(onodes2);
497: PetscFree(olengths2);
499: PetscFree(pa);
500: PetscFree(rbuf2);
501: PetscFree(s_waits1);
502: PetscFree(r_waits1);
503: PetscFree(s_waits2);
504: PetscFree(r_waits2);
505: PetscFree(table);
506: PetscFree(s_status);
507: PetscFree(recv_status);
508: PetscFree(xdata[0]);
509: PetscFree(xdata);
510: PetscFree(isz1);
511: return(0);
512: }
514: /*
515:    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
516: the work on the local processor.
518: Inputs:
519: C - MAT_MPIBAIJ;
520: imax - total no of index sets processed at a time;
521: table - an array of char - size = Mbs bits.
522:
523: Output:
524:   isz    - array containing the count of the solution elements corresponding
525: to each index set;
526: data - pointer to the solutions
527: */
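/*
   A worked example of the data structures above (hypothetical values):
   with Mbs = 8 and an index set that initially holds the block rows {2,5},
   bits 2 and 5 of table[i] are set, data[i] = {2,5} and isz[i] = 2 on
   entry.  The routine below then appends to data[i] every block column
   encountered in rows 2 and 5 of the diagonal (A) and off-diagonal (B)
   parts that is not yet marked in table[i], and updates isz[i] accordingly.
*/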
528: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
529: {
530: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
531: Mat A = c->A,B = c->B;
532: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
533: int start,end,val,max,rstart,cstart,*ai,*aj;
534: int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
535: PetscBT table_i;
538: rstart = c->rstart;
539: cstart = c->cstart;
540: ai = a->i;
541: aj = a->j;
542: bi = b->i;
543: bj = b->j;
544: garray = c->garray;
546:
547: for (i=0; i<imax; i++) {
548: data_i = data[i];
549: table_i = table[i];
550: isz_i = isz[i];
551: for (j=0,max=isz[i]; j<max; j++) {
552: row = data_i[j] - rstart;
553: start = ai[row];
554: end = ai[row+1];
555: for (k=start; k<end; k++) { /* Amat */
556: val = aj[k] + cstart;
557: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
558: }
559: start = bi[row];
560: end = bi[row+1];
561: for (k=start; k<end; k++) { /* Bmat */
562: val = garray[bj[k]];
563: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
564: }
565: }
566: isz[i] = isz_i;
567: }
568: return(0);
569: }
570: /*
571:      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
572: and return the output
574: Input:
575: C - the matrix
576: nrqr - no of messages being processed.
577:        rbuf - an array of pointers to the received requests
578:
579: Output:
580: xdata - array of messages to be sent back
581: isz1 - size of each message
583:   For better efficiency perhaps we should malloc separately each xdata[i],
584: then if a re-malloc is required we need only copy the data for that one row
585: rather than all previous rows as it is now, where a single large chunk of
586: memory is used.
588: */
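/*
   A concrete instance of the reply layout built below (hypothetical
   values): if a request carried two index sets, is[1] with 3 rows and
   is[5] with 2 rows, and the search found 4 and 3 distinct block indices
   for them respectively, then

      xdata[i] = {2,                 header: two index sets
                  1, 4,              is[1], 4 indices follow
                  5, 3,              is[5], 3 indices follow
                  <4 ints>,          data for is[1]
                  <3 ints>}          data for is[5]

   and isz1[i] = 12, the total length of the reply message.
*/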
589: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
590: {
591: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
592: Mat A = c->A,B = c->B;
593: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
594: int rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
595: int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
596: int val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
597: int *rbuf_i,kmax,rbuf_0,ierr;
598: PetscBT xtable;
601: rank = c->rank;
602: Mbs = c->Mbs;
603: rstart = c->rstart;
604: cstart = c->cstart;
605: ai = a->i;
606: aj = a->j;
607: bi = b->i;
608: bj = b->j;
609: garray = c->garray;
610:
611:
612: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
613: rbuf_i = rbuf[i];
614: rbuf_0 = rbuf_i[0];
615: ct += rbuf_0;
616: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
617: }
618:
619: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
620: else max1 = 1;
621: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
622: ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
623: ++no_malloc;
624: ierr = PetscBTCreate(Mbs,xtable);
625: ierr = PetscMemzero(isz1,nrqr*sizeof(int));
626:
627: ct3 = 0;
628:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
629: rbuf_i = rbuf[i];
630: rbuf_0 = rbuf_i[0];
631: ct1 = 2*rbuf_0+1;
632: ct2 = ct1;
633: ct3 += ct1;
634: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
635: PetscBTMemzero(Mbs,xtable);
636: oct2 = ct2;
637: kmax = rbuf_i[2*j];
638: for (k=0; k<kmax; k++,ct1++) {
639: row = rbuf_i[ct1];
640: if (!PetscBTLookupSet(xtable,row)) {
641: if (!(ct3 < mem_estimate)) {
642: new_estimate = (int)(1.5*mem_estimate)+1;
643: PetscMalloc(new_estimate * sizeof(int),&tmp);
644: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
645: PetscFree(xdata[0]);
646: xdata[0] = tmp;
647: mem_estimate = new_estimate; ++no_malloc;
648: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
649: }
650: xdata[i][ct2++] = row;
651: ct3++;
652: }
653: }
654: for (k=oct2,max2=ct2; k<max2; k++) {
655: row = xdata[i][k] - rstart;
656: start = ai[row];
657: end = ai[row+1];
658: for (l=start; l<end; l++) {
659: val = aj[l] + cstart;
660: if (!PetscBTLookupSet(xtable,val)) {
661: if (!(ct3 < mem_estimate)) {
662: new_estimate = (int)(1.5*mem_estimate)+1;
663: PetscMalloc(new_estimate * sizeof(int),&tmp);
664: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
665: PetscFree(xdata[0]);
666: xdata[0] = tmp;
667: mem_estimate = new_estimate; ++no_malloc;
668: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
669: }
670: xdata[i][ct2++] = val;
671: ct3++;
672: }
673: }
674: start = bi[row];
675: end = bi[row+1];
676: for (l=start; l<end; l++) {
677: val = garray[bj[l]];
678: if (!PetscBTLookupSet(xtable,val)) {
679: if (!(ct3 < mem_estimate)) {
680: new_estimate = (int)(1.5*mem_estimate)+1;
681: PetscMalloc(new_estimate * sizeof(int),&tmp);
682: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
683: PetscFree(xdata[0]);
684: xdata[0] = tmp;
685: mem_estimate = new_estimate; ++no_malloc;
686: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
687: }
688: xdata[i][ct2++] = val;
689: ct3++;
690: }
691: }
692: }
693: /* Update the header*/
694: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
695: xdata[i][2*j-1] = rbuf_i[2*j-1];
696: }
697: xdata[i][0] = rbuf_0;
698: xdata[i+1] = xdata[i] + ct2;
699: isz1[i] = ct2; /* size of each message */
700: }
701: PetscBTDestroy(xtable);
702:   PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
703: return(0);
704: }
706: static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,IS *,IS *,MatReuse,Mat *);
708: int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
709: {
710: IS *isrow_new,*iscol_new;
711: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
712: int nmax,nstages_local,nstages,i,pos,max_no,ierr;
715:   /* The compression and expansion should be avoided. It doesn't point
716:      out errors and might change the indices, hence buggy */
718: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
719: iscol_new = isrow_new + ismax;
720: MatCompressIndicesSorted_MPIBAIJ(C,ismax,isrow,isrow_new);
721: MatCompressIndicesSorted_MPIBAIJ(C,ismax,iscol,iscol_new);
723: /* Allocate memory to hold all the submatrices */
724: if (scall != MAT_REUSE_MATRIX) {
725: PetscMalloc((ismax+1)*sizeof(Mat),submat);
726: }
727: /* Determine the number of stages through which submatrices are done */
728: nmax = 20*1000000 / (c->Nbs * sizeof(int));
729: if (!nmax) nmax = 1;
730: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
731:
732:   /* Make sure every processor loops through the nstages */
733: MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
734: for (i=0,pos=0; i<nstages; i++) {
735: if (pos+nmax <= ismax) max_no = nmax;
736: else if (pos == ismax) max_no = 0;
737: else max_no = ismax-pos;
738: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
739: pos += max_no;
740: }
741:
742: for (i=0; i<ismax; i++) {
743: ISDestroy(isrow_new[i]);
744: ISDestroy(iscol_new[i]);
745: }
746: PetscFree(isrow_new);
747: return(0);
748: }
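/*
   A minimal usage sketch (illustrative only, not part of the original
   source): MatGetSubMatrices() dispatches to the routine above for MPIBAIJ
   matrices.  Both the row and the column index sets must be sorted and must
   contain complete blocks, otherwise the compression routines above raise
   "Indices are not block ordered".  The matrix A, the block size bs and the
   ranges below are hypothetical.
*/
#if 0
static int SampleGetSubMatrix(Mat A,int bs)
{
  IS  isrow,iscol;
  Mat *submat;
  int ierr;

  /* the first four blocks of rows and columns, block aligned and sorted */
  ierr = ISCreateStride(PETSC_COMM_SELF,4*bs,0,1,&isrow);CHKERRQ(ierr);
  ierr = ISCreateStride(PETSC_COMM_SELF,4*bs,0,1,&iscol);CHKERRQ(ierr);
  ierr = MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* ... work with the sequential matrix submat[0] ... */
  ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
  ierr = ISDestroy(isrow);CHKERRQ(ierr);
  ierr = ISDestroy(iscol);CHKERRQ(ierr);
  return 0;
}
#endif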
750: #if defined (PETSC_USE_CTABLE)
751: int PetscGetProc(const int gid, const int numprocs, const int proc_gnode[], int *proc)
752: {
753: int nGlobalNd = proc_gnode[numprocs];
754: int fproc = (int) ((float)gid * (float)numprocs / (float)nGlobalNd + 0.5);
755:
757: /* if(fproc < 0) SETERRQ(1,"fproc < 0");*/
758: if (fproc > numprocs) fproc = numprocs;
759: while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
760: if (gid < proc_gnode[fproc]) fproc--;
761: else fproc++;
762: }
763: /* if(fproc<0 || fproc>=numprocs) { SETERRQ(1,"fproc < 0 || fproc >= numprocs"); }*/
764: *proc = fproc;
765: return(0);
766: }
767: #endif
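/*
   A worked example of PetscGetProc() above (hypothetical values): with
   3 processors and ownership ranges proc_gnode = {0,10,25,40} (this is
   what c->rowners holds for block rows), the call
   PetscGetProc(17,3,proc_gnode,&proc) starts from the proportional guess
   fproc = (int)(17*3/40.0 + 0.5) = 1 and, since 10 <= 17 < 25, returns
   proc = 1 without any further walking.
*/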
769: /* -------------------------------------------------------------------------*/
770: static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
771: {
772: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
773: Mat A = c->A;
774: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
775: int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size;
776: int **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
777: int nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
778: int **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
779: int **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
780: int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
781: int bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
782: int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
783: int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
784: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
785: MPI_Request *r_waits4,*s_waits3,*s_waits4;
786: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
787: MPI_Status *r_status3,*r_status4,*s_status4;
788: MPI_Comm comm;
789: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
790: MatScalar *a_a=a->a,*b_a=b->a;
791: PetscTruth flag;
792: int *onodes1,*olengths1;
794: #if defined (PETSC_USE_CTABLE)
795: int tt;
796: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
797: #else
798: int **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
799: #endif
802: comm = C->comm;
803: tag0 = C->tag;
804: size = c->size;
805: rank = c->rank;
806:
807: /* Get some new tags to keep the communication clean */
808: PetscObjectGetNewTag((PetscObject)C,&tag1);
809: PetscObjectGetNewTag((PetscObject)C,&tag2);
810: PetscObjectGetNewTag((PetscObject)C,&tag3);
812: /* Check if the col indices are sorted */
813: for (i=0; i<ismax; i++) {
814: ISSorted(iscol[i],(PetscTruth*)&j);
815: if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
816: }
818: len = (2*ismax+1)*(sizeof(int*)+ sizeof(int));
819: #if !defined (PETSC_USE_CTABLE)
820: len += (Mbs+1)*sizeof(int);
821: #endif
822: PetscMalloc(len,&irow);
823: icol = irow + ismax;
824: nrow = (int*)(icol + ismax);
825: ncol = nrow + ismax;
826: #if !defined (PETSC_USE_CTABLE)
827: rtable = ncol + ismax;
828: /* Create hash table for the mapping :row -> proc*/
829: for (i=0,j=0; i<size; i++) {
830: jmax = c->rowners[i+1];
831: for (; j<jmax; j++) {
832: rtable[j] = i;
833: }
834: }
835: #endif
836:
837: for (i=0; i<ismax; i++) {
838: ISGetIndices(isrow[i],&irow[i]);
839: ISGetIndices(iscol[i],&icol[i]);
840: ISGetLocalSize(isrow[i],&nrow[i]);
841: ISGetLocalSize(iscol[i],&ncol[i]);
842: }
844: /* evaluate communication - mesg to who,length of mesg,and buffer space
845: required. Based on this, buffers are allocated, and data copied into them*/
846: PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
847: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
848: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
849: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
850: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
851: for (i=0; i<ismax; i++) {
852: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
853: jmax = nrow[i];
854: irow_i = irow[i];
855: for (j=0; j<jmax; j++) {
856: row = irow_i[j];
857: #if defined (PETSC_USE_CTABLE)
858: PetscGetProc(row,size,c->rowners,&proc);
859: #else
860: proc = rtable[row];
861: #endif
862: w4[proc]++;
863: }
864: for (j=0; j<size; j++) {
865: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
866: }
867: }
869: nrqs = 0; /* no of outgoing messages */
870: msz = 0; /* total mesg length for all proc */
871:   w1[rank] = 0;              /* no mesg sent to itself */
872: w3[rank] = 0;
873: for (i=0; i<size; i++) {
874: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
875: }
876: PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
877: for (i=0,j=0; i<size; i++) {
878: if (w1[i]) { pa[j] = i; j++; }
879: }
881: /* Each message would have a header = 1 + 2*(no of IS) + data */
882: for (i=0; i<nrqs; i++) {
883: j = pa[i];
884: w1[j] += w2[j] + 2* w3[j];
885: msz += w1[j];
886: }
888:   /* Determine the number of messages to expect, their lengths, and their from-ids */
889: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
890: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
892: /* Now post the Irecvs corresponding to these messages */
893: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
894:
895: PetscFree(onodes1);
896: PetscFree(olengths1);
898: /* Allocate Memory for outgoing messages */
899: len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
900: PetscMalloc(len,&sbuf1);
901: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
902: PetscMemzero(sbuf1,2*size*sizeof(int*));
903: /* allocate memory for outgoing data + buf to receive the first reply */
904: tmp = (int*)(ptr + size);
905: ctr = tmp + 2*msz;
907: {
908: int *iptr = tmp,ict = 0;
909: for (i=0; i<nrqs; i++) {
910: j = pa[i];
911: iptr += ict;
912: sbuf1[j] = iptr;
913: ict = w1[j];
914: }
915: }
917: /* Form the outgoing messages */
918: /* Initialise the header space */
919: for (i=0; i<nrqs; i++) {
920: j = pa[i];
921: sbuf1[j][0] = 0;
922: ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
923: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
924: }
925:
926: /* Parse the isrow and copy data into outbuf */
927: for (i=0; i<ismax; i++) {
928: ierr = PetscMemzero(ctr,size*sizeof(int));
929: irow_i = irow[i];
930: jmax = nrow[i];
931: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
932: row = irow_i[j];
933: #if defined (PETSC_USE_CTABLE)
934: PetscGetProc(row,size,c->rowners,&proc);
935: #else
936: proc = rtable[row];
937: #endif
938: if (proc != rank) { /* copy to the outgoing buf*/
939: ctr[proc]++;
940: *ptr[proc] = row;
941: ptr[proc]++;
942: }
943: }
944: /* Update the headers for the current IS */
945: for (j=0; j<size; j++) { /* Can Optimise this loop too */
946: if ((ctr_j = ctr[j])) {
947: sbuf1_j = sbuf1[j];
948: k = ++sbuf1_j[0];
949: sbuf1_j[2*k] = ctr_j;
950: sbuf1_j[2*k-1] = i;
951: }
952: }
953: }
955: /* Now post the sends */
956: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
957: for (i=0; i<nrqs; ++i) {
958: j = pa[i];
959: MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
960: }
962:   /* Post Receives to capture the buffer size */
963: ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
964: ierr = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
965: rbuf2[0] = tmp + msz;
966: for (i=1; i<nrqs; ++i) {
967: j = pa[i];
968: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
969: }
970: for (i=0; i<nrqs; ++i) {
971: j = pa[i];
972: MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
973: }
975: /* Send to other procs the buf size they should allocate */
977: /* Receive messages*/
978: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
979: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
980: len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
981: ierr = PetscMalloc(len,&sbuf2);
982: req_size = (int*)(sbuf2 + nrqr);
983: req_source = req_size + nrqr;
984:
985: {
986: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
987: int *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
989: for (i=0; i<nrqr; ++i) {
990: MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);
991: req_size[index] = 0;
992: rbuf1_i = rbuf1[index];
993: start = 2*rbuf1_i[0] + 1;
994: ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);
995: ierr = PetscMalloc(end*sizeof(int),&sbuf2[index]);
996: sbuf2_i = sbuf2[index];
997: for (j=start; j<end; j++) {
998: id = rbuf1_i[j] - rstart;
999: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1000: sbuf2_i[j] = ncols;
1001: req_size[index] += ncols;
1002: }
1003: req_source[index] = r_status1[i].MPI_SOURCE;
1004: /* form the header */
1005: sbuf2_i[0] = req_size[index];
1006: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
1007: MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);
1008: }
1009: }
1010: PetscFree(r_status1);
1011: PetscFree(r_waits1);
1013: /* recv buffer sizes */
1014: /* Receive messages*/
1016: PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
1017: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
1018: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1019: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1020: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1022: for (i=0; i<nrqs; ++i) {
1023: MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);
1024: PetscMalloc(rbuf2[index][0]*sizeof(int),&rbuf3[index]);
1025: PetscMalloc(rbuf2[index][0]*bs2*sizeof(MatScalar),&rbuf4[index]);
1026: MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,
1027: r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);
1028: MPI_Irecv(rbuf4[index],rbuf2[index][0]*bs2,MPIU_MATSCALAR,
1029: r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);
1030: }
1031: PetscFree(r_status2);
1032: PetscFree(r_waits2);
1033:
1034: /* Wait on sends1 and sends2 */
1035: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1036: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1038: MPI_Waitall(nrqs,s_waits1,s_status1);
1039: MPI_Waitall(nrqr,s_waits2,s_status2);
1040: PetscFree(s_status1);
1041: PetscFree(s_status2);
1042: PetscFree(s_waits1);
1043: PetscFree(s_waits2);
1045: /* Now allocate buffers for a->j, and send them off */
1046: PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);
1047: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1048: PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
1049: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1050:
1051: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1052: {
1053: for (i=0; i<nrqr; i++) {
1054: rbuf1_i = rbuf1[i];
1055: sbuf_aj_i = sbuf_aj[i];
1056: ct1 = 2*rbuf1_i[0] + 1;
1057: ct2 = 0;
1058: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1059: kmax = rbuf1[i][2*j];
1060: for (k=0; k<kmax; k++,ct1++) {
1061: row = rbuf1_i[ct1] - rstart;
1062: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1063: ncols = nzA + nzB;
1064: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1066: /* load the column indices for this row into cols*/
1067: cols = sbuf_aj_i + ct2;
1068: for (l=0; l<nzB; l++) {
1069: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
1070: else break;
1071: }
1072: imark = l;
1073: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
1074: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1075: ct2 += ncols;
1076: }
1077: }
1078: MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
1079: }
1080: }
1081: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1082: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1084: /* Allocate buffers for a->a, and send them off */
1085: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
1086: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1087: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
1088: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1089:
1090: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1091: {
1092: for (i=0; i<nrqr; i++) {
1093: rbuf1_i = rbuf1[i];
1094: sbuf_aa_i = sbuf_aa[i];
1095: ct1 = 2*rbuf1_i[0]+1;
1096: ct2 = 0;
1097: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1098: kmax = rbuf1_i[2*j];
1099: for (k=0; k<kmax; k++,ct1++) {
1100: row = rbuf1_i[ct1] - rstart;
1101: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1102: ncols = nzA + nzB;
1103: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1104: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
1106: /* load the column values for this row into vals*/
1107: vals = sbuf_aa_i+ct2*bs2;
1108: for (l=0; l<nzB; l++) {
1109: if ((bmap[cworkB[l]]) < cstart) {
1110: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1111: }
1112: else break;
1113: }
1114: imark = l;
1115: for (l=0; l<nzA; l++) {
1116: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
1117: }
1118: for (l=imark; l<nzB; l++) {
1119: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1120: }
1121: ct2 += ncols;
1122: }
1123: }
1124: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
1125: }
1126: }
1127: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1128: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1129: PetscFree(rbuf1);
1131: /* Form the matrix */
1132: /* create col map */
1133: {
1134: int *icol_i;
1135: #if defined (PETSC_USE_CTABLE)
1136:     /* Create col map*/
1137: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
1138: for (i=0; i<ismax+1; i++) {
1139: PetscTableCreate(((i<ismax) ? ncol[i] : ncol[i-1])+1,&colmaps[i]);
1140: }
1141: #else
1142: len = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
1143: ierr = PetscMalloc(len,&cmap);
1144: cmap[0] = (int *)(cmap + ismax);
1145: ierr = PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
1146: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1147: #endif
1148: for (i=0; i<ismax; i++) {
1149: jmax = ncol[i];
1150: icol_i = icol[i];
1151: #if defined (PETSC_USE_CTABLE)
1152: lcol1_gcol1 = colmaps[i];
1153: for (j=0; j<jmax; j++) {
1154: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1155: }
1156: #else
1157: cmap_i = cmap[i];
1158: for (j=0; j<jmax; j++) {
1159: cmap_i[icol_i[j]] = j+1;
1160: }
1161: #endif
1162: }
1163: }
1165: /* Create lens which is required for MatCreate... */
1166: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1167: len = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1168: ierr = PetscMalloc(len,&lens);
1169: lens[0] = (int *)(lens + ismax);
1170: ierr = PetscMemzero(lens[0],j*sizeof(int));
1171: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1172:
1173: /* Update lens from local data */
1174: for (i=0; i<ismax; i++) {
1175: jmax = nrow[i];
1176: #if defined (PETSC_USE_CTABLE)
1177: lcol1_gcol1 = colmaps[i];
1178: #else
1179: cmap_i = cmap[i];
1180: #endif
1181: irow_i = irow[i];
1182: lens_i = lens[i];
1183: for (j=0; j<jmax; j++) {
1184: row = irow_i[j];
1185: #if defined (PETSC_USE_CTABLE)
1186: PetscGetProc(row,size,c->rowners,&proc);
1187: #else
1188: proc = rtable[row];
1189: #endif
1190: if (proc == rank) {
1191: /* Get indices from matA and then from matB */
1192: row = row - rstart;
1193: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1194: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1195: #if defined (PETSC_USE_CTABLE)
1196: for (k=0; k<nzA; k++) {
1197: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1198: if (tt) { lens_i[j]++; }
1199: }
1200: for (k=0; k<nzB; k++) {
1201: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1202: if (tt) { lens_i[j]++; }
1203: }
1204: #else
1205: for (k=0; k<nzA; k++) {
1206: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1207: }
1208: for (k=0; k<nzB; k++) {
1209: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1210: }
1211: #endif
1212: }
1213: }
1214: }
1215: #if defined (PETSC_USE_CTABLE)
1216: /* Create row map*/
1217: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1218: for (i=0; i<ismax+1; i++){
1219: PetscTableCreate((i<ismax) ? nrow[i] : nrow[i-1],&rowmaps[i]);
1220: }
1221: #else
1222: /* Create row map*/
1223: len = (1+ismax)*sizeof(int*)+ ismax*Mbs*sizeof(int);
1224: ierr = PetscMalloc(len,&rmap);
1225: rmap[0] = (int *)(rmap + ismax);
1226: ierr = PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));
1227: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1228: #endif
1229: for (i=0; i<ismax; i++) {
1230: irow_i = irow[i];
1231: jmax = nrow[i];
1232: #if defined (PETSC_USE_CTABLE)
1233: lrow1_grow1 = rowmaps[i];
1234: for (j=0; j<jmax; j++) {
1235: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1236: }
1237: #else
1238: rmap_i = rmap[i];
1239: for (j=0; j<jmax; j++) {
1240: rmap_i[irow_i[j]] = j;
1241: }
1242: #endif
1243: }
1245: /* Update lens from offproc data */
1246: {
1247: int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1249: for (tmp2=0; tmp2<nrqs; tmp2++) {
1250: ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1251: index = pa[i];
1252: sbuf1_i = sbuf1[index];
1253: jmax = sbuf1_i[0];
1254: ct1 = 2*jmax+1;
1255: ct2 = 0;
1256: rbuf2_i = rbuf2[i];
1257: rbuf3_i = rbuf3[i];
1258: for (j=1; j<=jmax; j++) {
1259: is_no = sbuf1_i[2*j-1];
1260: max1 = sbuf1_i[2*j];
1261: lens_i = lens[is_no];
1262: #if defined (PETSC_USE_CTABLE)
1263: lcol1_gcol1 = colmaps[is_no];
1264: lrow1_grow1 = rowmaps[is_no];
1265: #else
1266: cmap_i = cmap[is_no];
1267: rmap_i = rmap[is_no];
1268: #endif
1269: for (k=0; k<max1; k++,ct1++) {
1270: #if defined (PETSC_USE_CTABLE)
1271: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1272: row--;
1273: if(row < 0) { SETERRQ(1,"row not found in table"); }
1274: #else
1275: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1276: #endif
1277: max2 = rbuf2_i[ct1];
1278: for (l=0; l<max2; l++,ct2++) {
1279: #if defined (PETSC_USE_CTABLE)
1280: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1281: if (tt) {
1282: lens_i[row]++;
1283: }
1284: #else
1285: if (cmap_i[rbuf3_i[ct2]]) {
1286: lens_i[row]++;
1287: }
1288: #endif
1289: }
1290: }
1291: }
1292: }
1293: }
1294: PetscFree(r_status3);
1295: PetscFree(r_waits3);
1296: MPI_Waitall(nrqr,s_waits3,s_status3);
1297: PetscFree(s_status3);
1298: PetscFree(s_waits3);
1300: /* Create the submatrices */
1301: if (scall == MAT_REUSE_MATRIX) {
1302: /*
1303: Assumes new rows are same length as the old rows, hence bug!
1304: */
1305: for (i=0; i<ismax; i++) {
1306: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1307: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) {
1308: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1309: }
1310: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);
1311: if (flag == PETSC_FALSE) {
1312: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1313: }
1314:       /* Initialize the matrix as if empty */
1315: PetscMemzero(mat->ilen,mat->mbs*sizeof(int));
1316: submats[i]->factor = C->factor;
1317: }
1318: } else {
1319: for (i=0; i<ismax; i++) {
1320: MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);
1321: }
1322: }
1324: /* Assemble the matrices */
1325: /* First assemble the local rows */
1326: {
1327: int ilen_row,*imat_ilen,*imat_j,*imat_i;
1328: MatScalar *imat_a;
1329:
1330: for (i=0; i<ismax; i++) {
1331: mat = (Mat_SeqBAIJ*)submats[i]->data;
1332: imat_ilen = mat->ilen;
1333: imat_j = mat->j;
1334: imat_i = mat->i;
1335: imat_a = mat->a;
1337: #if defined (PETSC_USE_CTABLE)
1338: lcol1_gcol1 = colmaps[i];
1339: lrow1_grow1 = rowmaps[i];
1340: #else
1341: cmap_i = cmap[i];
1342: rmap_i = rmap[i];
1343: #endif
1344: irow_i = irow[i];
1345: jmax = nrow[i];
1346: for (j=0; j<jmax; j++) {
1347: row = irow_i[j];
1348: #if defined (PETSC_USE_CTABLE)
1349: PetscGetProc(row,size,c->rowners,&proc);
1350: #else
1351: proc = rtable[row];
1352: #endif
1353: if (proc == rank) {
1354: row = row - rstart;
1355: nzA = a_i[row+1] - a_i[row];
1356: nzB = b_i[row+1] - b_i[row];
1357: cworkA = a_j + a_i[row];
1358: cworkB = b_j + b_i[row];
1359: vworkA = a_a + a_i[row]*bs2;
1360: vworkB = b_a + b_i[row]*bs2;
1361: #if defined (PETSC_USE_CTABLE)
1362: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1363: row--;
1364: if (row < 0) { SETERRQ(1,"row not found in table"); }
1365: #else
1366: row = rmap_i[row + rstart];
1367: #endif
1368: mat_i = imat_i[row];
1369: mat_a = imat_a + mat_i*bs2;
1370: mat_j = imat_j + mat_i;
1371: ilen_row = imat_ilen[row];
1373: /* load the column indices for this row into cols*/
1374: for (l=0; l<nzB; l++) {
1375: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1376: #if defined (PETSC_USE_CTABLE)
1377: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1378: if (tcol) {
1379: #else
1380: if ((tcol = cmap_i[ctmp])) {
1381: #endif
1382: *mat_j++ = tcol - 1;
1383: ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1384: mat_a += bs2;
1385: ilen_row++;
1386: }
1387: } else break;
1388: }
1389: imark = l;
1390: for (l=0; l<nzA; l++) {
1391: #if defined (PETSC_USE_CTABLE)
1392: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1393: if (tcol) {
1394: #else
1395: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1396: #endif
1397: *mat_j++ = tcol - 1;
1398: ierr = PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1399: mat_a += bs2;
1400: ilen_row++;
1401: }
1402: }
1403: for (l=imark; l<nzB; l++) {
1404: #if defined (PETSC_USE_CTABLE)
1405: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1406: if (tcol) {
1407: #else
1408: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1409: #endif
1410: *mat_j++ = tcol - 1;
1411: ierr = PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1412: mat_a += bs2;
1413: ilen_row++;
1414: }
1415: }
1416: imat_ilen[row] = ilen_row;
1417: }
1418: }
1419:
1420: }
1421: }
1423: /* Now assemble the off proc rows*/
1424: {
1425: int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1426: int *imat_j,*imat_i;
1427: MatScalar *imat_a,*rbuf4_i;
1429: for (tmp2=0; tmp2<nrqs; tmp2++) {
1430: ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1431: index = pa[i];
1432: sbuf1_i = sbuf1[index];
1433: jmax = sbuf1_i[0];
1434: ct1 = 2*jmax + 1;
1435: ct2 = 0;
1436: rbuf2_i = rbuf2[i];
1437: rbuf3_i = rbuf3[i];
1438: rbuf4_i = rbuf4[i];
1439: for (j=1; j<=jmax; j++) {
1440: is_no = sbuf1_i[2*j-1];
1441: #if defined (PETSC_USE_CTABLE)
1442: lrow1_grow1 = rowmaps[is_no];
1443: lcol1_gcol1 = colmaps[is_no];
1444: #else
1445: rmap_i = rmap[is_no];
1446: cmap_i = cmap[is_no];
1447: #endif
1448: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1449: imat_ilen = mat->ilen;
1450: imat_j = mat->j;
1451: imat_i = mat->i;
1452: imat_a = mat->a;
1453: max1 = sbuf1_i[2*j];
1454: for (k=0; k<max1; k++,ct1++) {
1455: row = sbuf1_i[ct1];
1456: #if defined (PETSC_USE_CTABLE)
1457: PetscTableFind(lrow1_grow1,row+1,&row);
1458: row--;
1459: if(row < 0) { SETERRQ(1,"row not found in table"); }
1460: #else
1461: row = rmap_i[row];
1462: #endif
1463: ilen = imat_ilen[row];
1464: mat_i = imat_i[row];
1465: mat_a = imat_a + mat_i*bs2;
1466: mat_j = imat_j + mat_i;
1467: max2 = rbuf2_i[ct1];
1468: for (l=0; l<max2; l++,ct2++) {
1469: #if defined (PETSC_USE_CTABLE)
1470: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1471: if (tcol) {
1472: #else
1473: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1474: #endif
1475: *mat_j++ = tcol - 1;
1476: /* *mat_a++= rbuf4_i[ct2]; */
1477: ierr = PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1478: mat_a += bs2;
1479: ilen++;
1480: }
1481: }
1482: imat_ilen[row] = ilen;
1483: }
1484: }
1485: }
1486: }
1487: PetscFree(r_status4);
1488: PetscFree(r_waits4);
1489: MPI_Waitall(nrqr,s_waits4,s_status4);
1490: PetscFree(s_waits4);
1491: PetscFree(s_status4);
1493: /* Restore the indices */
1494: for (i=0; i<ismax; i++) {
1495: ISRestoreIndices(isrow[i],irow+i);
1496: ISRestoreIndices(iscol[i],icol+i);
1497: }
1499: /* Destroy allocated memory */
1500: PetscFree(irow);
1501: PetscFree(w1);
1502: PetscFree(pa);
1504: PetscFree(sbuf1);
1505: PetscFree(rbuf2);
1506: for (i=0; i<nrqr; ++i) {
1507: PetscFree(sbuf2[i]);
1508: }
1509: for (i=0; i<nrqs; ++i) {
1510: PetscFree(rbuf3[i]);
1511: PetscFree(rbuf4[i]);
1512: }
1514: PetscFree(sbuf2);
1515: PetscFree(rbuf3);
1516: PetscFree(rbuf4);
1517: PetscFree(sbuf_aj[0]);
1518: PetscFree(sbuf_aj);
1519: PetscFree(sbuf_aa[0]);
1520: PetscFree(sbuf_aa);
1522: #if defined (PETSC_USE_CTABLE)
1523: for (i=0; i<ismax+1; i++){
1524: PetscTableDelete(rowmaps[i]);
1525: PetscTableDelete(colmaps[i]);
1526: }
1527: PetscFree(colmaps);
1528: PetscFree(rowmaps);
1529: /* Mark Adams */
1530: #else
1531: PetscFree(rmap);
1532: PetscFree(cmap);
1533: #endif
1534: PetscFree(lens);
1536: for (i=0; i<ismax; i++) {
1537: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1538: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1539: }
1541: return(0);
1542: }