Actual source code: aijnode.c

  1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
  2: /*
  3:   This file provides high performance routines for the AIJ (compressed row)
  4:   format by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
 6:  #include src/mat/impls/aij/seq/aij.h
 7:  #include src/vec/vecimpl.h

  9: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
 10: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
 11: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);

 13: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
 14: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
 15: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
 16: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
 17: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
 18: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
 19: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
 20: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);

 22: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
 23: {
 24:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 25:   int        ierr,i,count,m,n,min_mn,*ns_row,*ns_col;

 28:   n      = A->n;
 29:   m      = A->m;
 30:   ns_row = a->inode.size;
 31: 
 32:   min_mn = (m < n) ? m : n;
 33:   if (!ns) {
 34:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 35:     for(; count+1 < n; count++,i++);
 36:     if (count < n)  {
 37:       i++;
 38:     }
 39:     *size = i;
 40:     return(0);
 41:   }
 42:   PetscMalloc((n+1)*sizeof(int),&ns_col);
 43: 
 44:   /* Use the same row structure wherever feasible. */
 45:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 46:     ns_col[i] = ns_row[i];
 47:   }

 49:   /* if m < n; pad up the remainder with inode_limit */
 50:   for(; count+1 < n; count++,i++) {
 51:     ns_col[i] = 1;
 52:   }
 53:   /* The last node is the odd ball. padd it up with the remaining rows; */
 54:   if (count < n)  {
 55:     ns_col[i] = n - count;
 56:     i++;
 57:   } else if (count > n) {
 58:     /* Adjust for the over estimation */
 59:     ns_col[i-1] += n - count;
 60:   }
 61:   *size = i;
 62:   *ns   = ns_col;
 63:   return(0);
 64: }


 67: /*
 68:       This builds symmetric version of nonzero structure,
 69: */
 70: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
 71: {
 72:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 73:   int        *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
 74:   int        *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 77:   nslim_row = a->inode.node_count;
 78:   m         = A->m;
 79:   n         = A->n;
 80:   if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
 81: 
 82:   /* Use the row_inode as column_inode */
 83:   nslim_col = nslim_row;
 84:   ns_col    = ns_row;

 86:   /* allocate space for reformated inode structure */
 87:   PetscMalloc((nslim_col+1)*sizeof(int),&tns);
 88:   PetscMalloc((n+1)*sizeof(int),&tvc);
 89:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 91:   for (i1=0,col=0; i1<nslim_col; ++i1){
 92:     nsz = ns_col[i1];
 93:     for (i2=0; i2<nsz; ++i2,++col)
 94:       tvc[col] = i1;
 95:   }
 96:   /* allocate space for row pointers */
 97:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
 98:   *iia = ia;
 99:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
100:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

102:   /* determine the number of columns in each row */
103:   ia[0] = oshift;
104:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

106:     j    = aj + ai[row] + ishift;
107:     jmax = aj + ai[row+1] + ishift;
108:     i2   = 0;
109:     col  = *j++ + ishift;
110:     i2   = tvc[col];
111:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
112:       ia[i1+1]++;
113:       ia[i2+1]++;
114:       i2++;                     /* Start col of next node */
115:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
116:       i2 = tvc[col];
117:     }
118:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
119:   }

121:   /* shift ia[i] to point to next row */
122:   for (i1=1; i1<nslim_row+1; i1++) {
123:     row        = ia[i1-1];
124:     ia[i1]    += row;
125:     work[i1-1] = row - oshift;
126:   }

128:   /* allocate space for column pointers */
129:   nz   = ia[nslim_row] + (!ishift);
130:   PetscMalloc(nz*sizeof(int),&ja);
131:   *jja = ja;

133:  /* loop over lower triangular part putting into ja */
134:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
135:     j    = aj + ai[row] + ishift;
136:     jmax = aj + ai[row+1] + ishift;
137:     i2   = 0;                     /* Col inode index */
138:     col  = *j++ + ishift;
139:     i2   = tvc[col];
140:     while (i2<i1 && j<jmax) {
141:       ja[work[i2]++] = i1 + oshift;
142:       ja[work[i1]++] = i2 + oshift;
143:       ++i2;
144:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
145:       i2 = tvc[col];
146:     }
147:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

149:   }
150:   PetscFree(work);
151:   PetscFree(tns);
152:   PetscFree(tvc);
153:   return(0);
154: }

156: /*
157:       This builds nonsymmetric version of nonzero structure,
158: */
159: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
160: {
161:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
162:   int        *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
163:   int        *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

166:   nslim_row = a->inode.node_count;
167:   n         = A->n;

169:   /* Create The column_inode for this matrix */
170:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
171: 
172:   /* allocate space for reformated column_inode structure */
173:   PetscMalloc((nslim_col +1)*sizeof(int),&tns);
174:   PetscMalloc((n +1)*sizeof(int),&tvc);
175:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

177:   for (i1=0,col=0; i1<nslim_col; ++i1){
178:     nsz = ns_col[i1];
179:     for (i2=0; i2<nsz; ++i2,++col)
180:       tvc[col] = i1;
181:   }
182:   /* allocate space for row pointers */
183:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
184:   *iia = ia;
185:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
186:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

188:   /* determine the number of columns in each row */
189:   ia[0] = oshift;
190:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
191:     j   = aj + ai[row] + ishift;
192:     col = *j++ + ishift;
193:     i2  = tvc[col];
194:     nz  = ai[row+1] - ai[row];
195:     while (nz-- > 0) {           /* off-diagonal elemets */
196:       ia[i1+1]++;
197:       i2++;                     /* Start col of next node */
198:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
199:       i2 = tvc[col];
200:     }
201:   }

203:   /* shift ia[i] to point to next row */
204:   for (i1=1; i1<nslim_row+1; i1++) {
205:     row        = ia[i1-1];
206:     ia[i1]    += row;
207:     work[i1-1] = row - oshift;
208:   }

210:   /* allocate space for column pointers */
211:   nz   = ia[nslim_row] + (!ishift);
212:   PetscMalloc(nz*sizeof(int),&ja);
213:   *jja = ja;

215:  /* loop over matrix putting into ja */
216:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
217:     j   = aj + ai[row] + ishift;
218:     i2  = 0;                     /* Col inode index */
219:     col = *j++ + ishift;
220:     i2  = tvc[col];
221:     nz  = ai[row+1] - ai[row];
222:     while (nz-- > 0) {
223:       ja[work[i1]++] = i2 + oshift;
224:       ++i2;
225:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
226:       i2 = tvc[col];
227:     }
228:   }
229:   PetscFree(ns_col);
230:   PetscFree(work);
231:   PetscFree(tns);
232:   PetscFree(tvc);
233:   return(0);
234: }

236: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
237: {
238:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
239:   int        ierr,ishift;

242:   *n     = a->inode.node_count;
243:   if (!ia) return(0);

245:   ishift = a->indexshift;
246:   if (symmetric) {
247:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
248:   } else {
249:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
250:   }
251:   return(0);
252: }

254: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
255: {

259:   if (!ia) return(0);
260:   PetscFree(*ia);
261:   PetscFree(*ja);
262:   return(0);
263: }

265: /* ----------------------------------------------------------- */

267: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
268: {
269:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
270:   int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
271:   int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

274:   nslim_row = a->inode.node_count;
275:   n         = A->n;

277:   /* Create The column_inode for this matrix */
278:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
279: 
280:   /* allocate space for reformated column_inode structure */
281:   PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
282:   PetscMalloc((n + 1)*sizeof(int),&tvc);
283:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

285:   for (i1=0,col=0; i1<nslim_col; ++i1){
286:     nsz = ns_col[i1];
287:     for (i2=0; i2<nsz; ++i2,++col)
288:       tvc[col] = i1;
289:   }
290:   /* allocate space for column pointers */
291:   PetscMalloc((nslim_col+1)*sizeof(int),&ia);
292:   *iia = ia;
293:   PetscMemzero(ia,(nslim_col+1)*sizeof(int));
294:   PetscMalloc((nslim_col+1)*sizeof(int),&work);

296:   /* determine the number of columns in each row */
297:   ia[0] = oshift;
298:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
299:     j   = aj + ai[row] + ishift;
300:     col = *j++ + ishift;
301:     i2  = tvc[col];
302:     nz  = ai[row+1] - ai[row];
303:     while (nz-- > 0) {           /* off-diagonal elemets */
304:       /* ia[i1+1]++; */
305:       ia[i2+1]++;
306:       i2++;
307:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
308:       i2 = tvc[col];
309:     }
310:   }

312:   /* shift ia[i] to point to next col */
313:   for (i1=1; i1<nslim_col+1; i1++) {
314:     col        = ia[i1-1];
315:     ia[i1]    += col;
316:     work[i1-1] = col - oshift;
317:   }

319:   /* allocate space for column pointers */
320:   nz   = ia[nslim_col] + (!ishift);
321:   PetscMalloc(nz*sizeof(int),&ja);
322:   *jja = ja;

324:  /* loop over matrix putting into ja */
325:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
326:     j   = aj + ai[row] + ishift;
327:     i2  = 0;                     /* Col inode index */
328:     col = *j++ + ishift;
329:     i2  = tvc[col];
330:     nz  = ai[row+1] - ai[row];
331:     while (nz-- > 0) {
332:       /* ja[work[i1]++] = i2 + oshift; */
333:       ja[work[i2]++] = i1 + oshift;
334:       i2++;
335:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
336:       i2 = tvc[col];
337:     }
338:   }
339:   PetscFree(ns_col);
340:   PetscFree(work);
341:   PetscFree(tns);
342:   PetscFree(tvc);
343:   return(0);
344: }

346: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
347: {
348:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
349:   int        ierr,ishift;

352:   Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
353:   if (!ia) return(0);

355:   ishift = a->indexshift;
356:   if (symmetric) {
357:     /* Since the indices are symmetric it does'nt matter */
358:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
359:   } else {
360:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
361:   }
362:   return(0);
363: }

365: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
366: {

370:   if (!ia) return(0);
371:   PetscFree(*ia);
372:   PetscFree(*ja);
373:   return(0);
374: }

376: /* ----------------------------------------------------------- */

378: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
379: {
380:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
381:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
382:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y;
383:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
384:   int          shift = a->indexshift;
385: 
386: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
387: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
388: #endif

391:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
392:   node_max = a->inode.node_count;
393:   ns       = a->inode.size;     /* Node Size array */
394:   VecGetArray(xx,&x);
395:   VecGetArray(yy,&y);
396:   x    = x + shift;             /* shift for Fortran start by 1 indexing */
397:   idx  = a->j;
398:   v1   = a->a;
399:   ii   = a->i;

401:   for (i = 0,row = 0; i< node_max; ++i){
402:     nsz  = ns[i];
403:     n    = ii[1] - ii[0];
404:     ii  += nsz;
405:     sz   = n;                   /*No of non zeros in this row */
406:                                 /* Switch on the size of Node */
407:     switch (nsz){               /* Each loop in 'case' is unrolled */
408:     case 1 :
409:       sum1  = 0;
410: 
411:       for(n = 0; n< sz-1; n+=2) {
412:         i1   = idx[0];          /* The instructions are ordered to */
413:         i2   = idx[1];          /* make the compiler's job easy */
414:         idx += 2;
415:         tmp0 = x[i1];
416:         tmp1 = x[i2];
417:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
418:        }
419: 
420:       if (n == sz-1){          /* Take care of the last nonzero  */
421:         tmp0  = x[*idx++];
422:         sum1 += *v1++ * tmp0;
423:       }
424:       y[row++]=sum1;
425:       break;
426:     case 2:
427:       sum1  = 0;
428:       sum2  = 0;
429:       v2    = v1 + n;
430: 
431:       for (n = 0; n< sz-1; n+=2) {
432:         i1   = idx[0];
433:         i2   = idx[1];
434:         idx += 2;
435:         tmp0 = x[i1];
436:         tmp1 = x[i2];
437:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
439:       }
440:       if (n == sz-1){
441:         tmp0  = x[*idx++];
442:         sum1 += *v1++ * tmp0;
443:         sum2 += *v2++ * tmp0;
444:       }
445:       y[row++]=sum1;
446:       y[row++]=sum2;
447:       v1      =v2;              /* Since the next block to be processed starts there*/
448:       idx    +=sz;
449:       break;
450:     case 3:
451:       sum1  = 0;
452:       sum2  = 0;
453:       sum3  = 0;
454:       v2    = v1 + n;
455:       v3    = v2 + n;
456: 
457:       for (n = 0; n< sz-1; n+=2) {
458:         i1   = idx[0];
459:         i2   = idx[1];
460:         idx += 2;
461:         tmp0 = x[i1];
462:         tmp1 = x[i2];
463:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
464:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
465:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
466:       }
467:       if (n == sz-1){
468:         tmp0  = x[*idx++];
469:         sum1 += *v1++ * tmp0;
470:         sum2 += *v2++ * tmp0;
471:         sum3 += *v3++ * tmp0;
472:       }
473:       y[row++]=sum1;
474:       y[row++]=sum2;
475:       y[row++]=sum3;
476:       v1       =v3;             /* Since the next block to be processed starts there*/
477:       idx     +=2*sz;
478:       break;
479:     case 4:
480:       sum1  = 0;
481:       sum2  = 0;
482:       sum3  = 0;
483:       sum4  = 0;
484:       v2    = v1 + n;
485:       v3    = v2 + n;
486:       v4    = v3 + n;
487: 
488:       for (n = 0; n< sz-1; n+=2) {
489:         i1   = idx[0];
490:         i2   = idx[1];
491:         idx += 2;
492:         tmp0 = x[i1];
493:         tmp1 = x[i2];
494:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
495:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
496:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
497:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
498:       }
499:       if (n == sz-1){
500:         tmp0  = x[*idx++];
501:         sum1 += *v1++ * tmp0;
502:         sum2 += *v2++ * tmp0;
503:         sum3 += *v3++ * tmp0;
504:         sum4 += *v4++ * tmp0;
505:       }
506:       y[row++]=sum1;
507:       y[row++]=sum2;
508:       y[row++]=sum3;
509:       y[row++]=sum4;
510:       v1      =v4;              /* Since the next block to be processed starts there*/
511:       idx    +=3*sz;
512:       break;
513:     case 5:
514:       sum1  = 0;
515:       sum2  = 0;
516:       sum3  = 0;
517:       sum4  = 0;
518:       sum5  = 0;
519:       v2    = v1 + n;
520:       v3    = v2 + n;
521:       v4    = v3 + n;
522:       v5    = v4 + n;
523: 
524:       for (n = 0; n<sz-1; n+=2) {
525:         i1   = idx[0];
526:         i2   = idx[1];
527:         idx += 2;
528:         tmp0 = x[i1];
529:         tmp1 = x[i2];
530:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
531:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
532:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
533:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
534:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
535:       }
536:       if (n == sz-1){
537:         tmp0  = x[*idx++];
538:         sum1 += *v1++ * tmp0;
539:         sum2 += *v2++ * tmp0;
540:         sum3 += *v3++ * tmp0;
541:         sum4 += *v4++ * tmp0;
542:         sum5 += *v5++ * tmp0;
543:       }
544:       y[row++]=sum1;
545:       y[row++]=sum2;
546:       y[row++]=sum3;
547:       y[row++]=sum4;
548:       y[row++]=sum5;
549:       v1      =v5;       /* Since the next block to be processed starts there */
550:       idx    +=4*sz;
551:       break;
552:     default :
553:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
554:     }
555:   }
556:   VecRestoreArray(xx,&x);
557:   VecRestoreArray(yy,&y);
558:   PetscLogFlops(2*a->nz - A->m);
559:   return(0);
560: }
561: /* ----------------------------------------------------------- */
562: /* Almost same code as the MatMult_SeqAij_Inode() */
563: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
564: {
565:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
566:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
567:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
568:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
569:   int          shift = a->indexshift;
570: 
572:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
573:   node_max = a->inode.node_count;
574:   ns       = a->inode.size;     /* Node Size array */
575:   VecGetArray(xx,&x);
576:   VecGetArray(yy,&y);
577:   if (zz != yy) {
578:     VecGetArray(zz,&z);
579:   } else {
580:     z = y;
581:   }
582:   zt = z;

584:   x    = x + shift;             /* shift for Fortran start by 1 indexing */
585:   idx  = a->j;
586:   v1   = a->a;
587:   ii   = a->i;

589:   for (i = 0,row = 0; i< node_max; ++i){
590:     nsz  = ns[i];
591:     n    = ii[1] - ii[0];
592:     ii  += nsz;
593:     sz   = n;                   /*No of non zeros in this row */
594:                                 /* Switch on the size of Node */
595:     switch (nsz){               /* Each loop in 'case' is unrolled */
596:     case 1 :
597:       sum1  = *zt++;
598: 
599:       for(n = 0; n< sz-1; n+=2) {
600:         i1   = idx[0];          /* The instructions are ordered to */
601:         i2   = idx[1];          /* make the compiler's job easy */
602:         idx += 2;
603:         tmp0 = x[i1];
604:         tmp1 = x[i2];
605:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
606:        }
607: 
608:       if(n   == sz-1){          /* Take care of the last nonzero  */
609:         tmp0  = x[*idx++];
610:         sum1 += *v1++ * tmp0;
611:       }
612:       y[row++]=sum1;
613:       break;
614:     case 2:
615:       sum1  = *zt++;
616:       sum2  = *zt++;
617:       v2    = v1 + n;
618: 
619:       for(n = 0; n< sz-1; n+=2) {
620:         i1   = idx[0];
621:         i2   = idx[1];
622:         idx += 2;
623:         tmp0 = x[i1];
624:         tmp1 = x[i2];
625:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
626:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
627:       }
628:       if(n   == sz-1){
629:         tmp0  = x[*idx++];
630:         sum1 += *v1++ * tmp0;
631:         sum2 += *v2++ * tmp0;
632:       }
633:       y[row++]=sum1;
634:       y[row++]=sum2;
635:       v1      =v2;              /* Since the next block to be processed starts there*/
636:       idx    +=sz;
637:       break;
638:     case 3:
639:       sum1  = *zt++;
640:       sum2  = *zt++;
641:       sum3  = *zt++;
642:       v2    = v1 + n;
643:       v3    = v2 + n;
644: 
645:       for (n = 0; n< sz-1; n+=2) {
646:         i1   = idx[0];
647:         i2   = idx[1];
648:         idx += 2;
649:         tmp0 = x[i1];
650:         tmp1 = x[i2];
651:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
652:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
653:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
654:       }
655:       if (n == sz-1){
656:         tmp0  = x[*idx++];
657:         sum1 += *v1++ * tmp0;
658:         sum2 += *v2++ * tmp0;
659:         sum3 += *v3++ * tmp0;
660:       }
661:       y[row++]=sum1;
662:       y[row++]=sum2;
663:       y[row++]=sum3;
664:       v1       =v3;             /* Since the next block to be processed starts there*/
665:       idx     +=2*sz;
666:       break;
667:     case 4:
668:       sum1  = *zt++;
669:       sum2  = *zt++;
670:       sum3  = *zt++;
671:       sum4  = *zt++;
672:       v2    = v1 + n;
673:       v3    = v2 + n;
674:       v4    = v3 + n;
675: 
676:       for (n = 0; n< sz-1; n+=2) {
677:         i1   = idx[0];
678:         i2   = idx[1];
679:         idx += 2;
680:         tmp0 = x[i1];
681:         tmp1 = x[i2];
682:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
683:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
684:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
685:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
686:       }
687:       if (n == sz-1){
688:         tmp0  = x[*idx++];
689:         sum1 += *v1++ * tmp0;
690:         sum2 += *v2++ * tmp0;
691:         sum3 += *v3++ * tmp0;
692:         sum4 += *v4++ * tmp0;
693:       }
694:       y[row++]=sum1;
695:       y[row++]=sum2;
696:       y[row++]=sum3;
697:       y[row++]=sum4;
698:       v1      =v4;              /* Since the next block to be processed starts there*/
699:       idx    +=3*sz;
700:       break;
701:     case 5:
702:       sum1  = *zt++;
703:       sum2  = *zt++;
704:       sum3  = *zt++;
705:       sum4  = *zt++;
706:       sum5  = *zt++;
707:       v2    = v1 + n;
708:       v3    = v2 + n;
709:       v4    = v3 + n;
710:       v5    = v4 + n;
711: 
712:       for (n = 0; n<sz-1; n+=2) {
713:         i1   = idx[0];
714:         i2   = idx[1];
715:         idx += 2;
716:         tmp0 = x[i1];
717:         tmp1 = x[i2];
718:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
719:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
720:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
721:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
722:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
723:       }
724:       if(n   == sz-1){
725:         tmp0  = x[*idx++];
726:         sum1 += *v1++ * tmp0;
727:         sum2 += *v2++ * tmp0;
728:         sum3 += *v3++ * tmp0;
729:         sum4 += *v4++ * tmp0;
730:         sum5 += *v5++ * tmp0;
731:       }
732:       y[row++]=sum1;
733:       y[row++]=sum2;
734:       y[row++]=sum3;
735:       y[row++]=sum4;
736:       y[row++]=sum5;
737:       v1      =v5;       /* Since the next block to be processed starts there */
738:       idx    +=4*sz;
739:       break;
740:     default :
741:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
742:     }
743:   }
744:   VecRestoreArray(xx,&x);
745:   VecRestoreArray(yy,&y);
746:   if (zz != yy) {
747:     VecRestoreArray(zz,&z);
748:   }
749:   PetscLogFlops(2*a->nz);
750:   return(0);
751: }
752: /* ----------------------------------------------------------- */
753: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,int *,ISColoring *);

755: /*
756:     samestructure indicates that the matrix has not changed its nonzero structure so we 
757:     do not need to recompute the inodes 
758: */
759: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
760: {
761:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
762:   int        ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
763:   PetscTruth flag,flg;

766:   if (samestructure && a->inode.checked) return(0);

768:   a->inode.checked = PETSC_TRUE;

770:   /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
771:   if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODESn"); return(0);}
772:   PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
773:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inoden");return(0);}
774:   PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
775:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unrolln");return(0);}
776:   PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
777:   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
778:   m = A->m;
779:   if (a->inode.size) {ns = a->inode.size;}
780:   else {PetscMalloc((m+1)*sizeof(int),&ns);}

782:   i          = 0;
783:   node_count = 0;
784:   idx        = a->j;
785:   ii         = a->i;
786:   while (i < m){                /* For each row */
787:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
788:     /* Limits the number of elements in a node to 'a->inode.limit' */
789:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
790:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
791:       if (nzy != nzx) break;
792:       idy  += nzx;             /* Same nonzero pattern */
793:       PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
794:       if (flag == PETSC_FALSE) break;
795:     }
796:     ns[node_count++] = blk_size;
797:     idx += blk_size*nzx;
798:     i    = j;
799:   }
800:   /* If not enough inodes found,, do not use inode version of the routines */
801:   if (!a->inode.size && m && node_count > 0.9*m) {
802:     PetscFree(ns);
803:     A->ops->mult            = MatMult_SeqAIJ;
804:     A->ops->multadd         = MatMultAdd_SeqAIJ;
805:     A->ops->solve           = MatSolve_SeqAIJ;
806:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
807:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ;
808:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ;
809:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ;
810:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
811:     A->ops->coloringpatch   = 0;
812:     a->inode.node_count     = 0;
813:     a->inode.size           = 0;
814:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routinesn",node_count,m);
815:   } else {
816:     A->ops->mult            = MatMult_SeqAIJ_Inode;
817:     A->ops->multadd         = MatMultAdd_SeqAIJ_Inode;
818:     A->ops->solve           = MatSolve_SeqAIJ_Inode;
819:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
820:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
821:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
822:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
823:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
824:     A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
825:     a->inode.node_count     = node_count;
826:     a->inode.size           = ns;
827:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routinesn",node_count,m,a->inode.limit);
828:   }
829:   return(0);
830: }

832: /* ----------------------------------------------------------- */
833: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
834: {
835:   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
836:   IS          iscol = a->col,isrow = a->row;
837:   int         *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,shift = a->indexshift,*a_j = a->j;
838:   int         node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
839:   PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
840:   PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

843:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
844:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
845:   node_max = a->inode.node_count;
846:   ns       = a->inode.size;     /* Node Size array */

848:   VecGetArray(bb,&b);
849:   VecGetArray(xx,&x);
850:   tmp  = a->solve_work;
851: 
852:   ISGetIndices(isrow,&rout); r = rout;
853:   ISGetIndices(iscol,&cout); c = cout + (n-1);
854: 
855:   /* forward solve the lower triangular */
856:   tmps = tmp + shift;
857:   aa   = a_a +shift;
858:   aj   = a_j + shift;
859:   ad   = a->diag;

861:   for (i = 0,row = 0; i< node_max; ++i){
862:     nsz = ns[i];
863:     aii = ai[row];
864:     v1  = aa + aii;
865:     vi  = aj + aii;
866:     nz  = ad[row]- aii;
867: 
868:     switch (nsz){               /* Each loop in 'case' is unrolled */
869:     case 1 :
870:       sum1 = b[*r++];
871:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
872:       for(j=0; j<nz-1; j+=2){
873:         i0   = vi[0];
874:         i1   = vi[1];
875:         vi  +=2;
876:         tmp0 = tmps[i0];
877:         tmp1 = tmps[i1];
878:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879:       }
880:       if(j == nz-1){
881:         tmp0 = tmps[*vi++];
882:         sum1 -= *v1++ *tmp0;
883:       }
884:       tmp[row ++]=sum1;
885:       break;
886:     case 2:
887:       sum1 = b[*r++];
888:       sum2 = b[*r++];
889:       v2   = aa + ai[row+1];

891:       for(j=0; j<nz-1; j+=2){
892:         i0   = vi[0];
893:         i1   = vi[1];
894:         vi  +=2;
895:         tmp0 = tmps[i0];
896:         tmp1 = tmps[i1];
897:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
898:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
899:       }
900:       if(j == nz-1){
901:         tmp0 = tmps[*vi++];
902:         sum1 -= *v1++ *tmp0;
903:         sum2 -= *v2++ *tmp0;
904:       }
905:       sum2 -= *v2++ * sum1;
906:       tmp[row ++]=sum1;
907:       tmp[row ++]=sum2;
908:       break;
909:     case 3:
910:       sum1 = b[*r++];
911:       sum2 = b[*r++];
912:       sum3 = b[*r++];
913:       v2   = aa + ai[row+1];
914:       v3   = aa + ai[row+2];
915: 
916:       for (j=0; j<nz-1; j+=2){
917:         i0   = vi[0];
918:         i1   = vi[1];
919:         vi  +=2;
920:         tmp0 = tmps[i0];
921:         tmp1 = tmps[i1];
922:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
923:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
924:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
925:       }
926:       if (j == nz-1){
927:         tmp0 = tmps[*vi++];
928:         sum1 -= *v1++ *tmp0;
929:         sum2 -= *v2++ *tmp0;
930:         sum3 -= *v3++ *tmp0;
931:       }
932:       sum2 -= *v2++ * sum1;
933:       sum3 -= *v3++ * sum1;
934:       sum3 -= *v3++ * sum2;
935:       tmp[row ++]=sum1;
936:       tmp[row ++]=sum2;
937:       tmp[row ++]=sum3;
938:       break;
939: 
940:     case 4:
941:       sum1 = b[*r++];
942:       sum2 = b[*r++];
943:       sum3 = b[*r++];
944:       sum4 = b[*r++];
945:       v2   = aa + ai[row+1];
946:       v3   = aa + ai[row+2];
947:       v4   = aa + ai[row+3];
948: 
949:       for (j=0; j<nz-1; j+=2){
950:         i0   = vi[0];
951:         i1   = vi[1];
952:         vi  +=2;
953:         tmp0 = tmps[i0];
954:         tmp1 = tmps[i1];
955:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
956:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
957:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
958:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
959:       }
960:       if (j == nz-1){
961:         tmp0 = tmps[*vi++];
962:         sum1 -= *v1++ *tmp0;
963:         sum2 -= *v2++ *tmp0;
964:         sum3 -= *v3++ *tmp0;
965:         sum4 -= *v4++ *tmp0;
966:       }
967:       sum2 -= *v2++ * sum1;
968:       sum3 -= *v3++ * sum1;
969:       sum4 -= *v4++ * sum1;
970:       sum3 -= *v3++ * sum2;
971:       sum4 -= *v4++ * sum2;
972:       sum4 -= *v4++ * sum3;
973: 
974:       tmp[row ++]=sum1;
975:       tmp[row ++]=sum2;
976:       tmp[row ++]=sum3;
977:       tmp[row ++]=sum4;
978:       break;
979:     case 5:
980:       sum1 = b[*r++];
981:       sum2 = b[*r++];
982:       sum3 = b[*r++];
983:       sum4 = b[*r++];
984:       sum5 = b[*r++];
985:       v2   = aa + ai[row+1];
986:       v3   = aa + ai[row+2];
987:       v4   = aa + ai[row+3];
988:       v5   = aa + ai[row+4];
989: 
990:       for (j=0; j<nz-1; j+=2){
991:         i0   = vi[0];
992:         i1   = vi[1];
993:         vi  +=2;
994:         tmp0 = tmps[i0];
995:         tmp1 = tmps[i1];
996:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
997:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
998:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
999:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1000:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1001:       }
1002:       if (j == nz-1){
1003:         tmp0 = tmps[*vi++];
1004:         sum1 -= *v1++ *tmp0;
1005:         sum2 -= *v2++ *tmp0;
1006:         sum3 -= *v3++ *tmp0;
1007:         sum4 -= *v4++ *tmp0;
1008:         sum5 -= *v5++ *tmp0;
1009:       }

1011:       sum2 -= *v2++ * sum1;
1012:       sum3 -= *v3++ * sum1;
1013:       sum4 -= *v4++ * sum1;
1014:       sum5 -= *v5++ * sum1;
1015:       sum3 -= *v3++ * sum2;
1016:       sum4 -= *v4++ * sum2;
1017:       sum5 -= *v5++ * sum2;
1018:       sum4 -= *v4++ * sum3;
1019:       sum5 -= *v5++ * sum3;
1020:       sum5 -= *v5++ * sum4;
1021: 
1022:       tmp[row ++]=sum1;
1023:       tmp[row ++]=sum2;
1024:       tmp[row ++]=sum3;
1025:       tmp[row ++]=sum4;
1026:       tmp[row ++]=sum5;
1027:       break;
1028:     default:
1029:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1030:     }
1031:   }
1032:   /* backward solve the upper triangular */
1033:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1034:     nsz = ns[i];
1035:     aii = ai[row+1] -1;
1036:     v1  = aa + aii;
1037:     vi  = aj + aii;
1038:     nz  = aii- ad[row];
1039:     switch (nsz){               /* Each loop in 'case' is unrolled */
1040:     case 1 :
1041:       sum1 = tmp[row];

1043:       for(j=nz ; j>1; j-=2){
1044:         i0   = vi[0];
1045:         i1   = vi[-1];
1046:         vi  -=2;
1047:         tmp0 = tmps[i0];
1048:         tmp1 = tmps[i1];
1049:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1050:       }
1051:       if (j==1){
1052:         tmp0  = tmps[*vi--];
1053:         sum1 -= *v1-- * tmp0;
1054:       }
1055:       x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1056:       break;
1057:     case 2 :
1058:       sum1 = tmp[row];
1059:       sum2 = tmp[row -1];
1060:       v2   = aa + ai[row]-1;
1061:       for (j=nz ; j>1; j-=2){
1062:         i0   = vi[0];
1063:         i1   = vi[-1];
1064:         vi  -=2;
1065:         tmp0 = tmps[i0];
1066:         tmp1 = tmps[i1];
1067:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1068:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1069:       }
1070:       if (j==1){
1071:         tmp0  = tmps[*vi--];
1072:         sum1 -= *v1-- * tmp0;
1073:         sum2 -= *v2-- * tmp0;
1074:       }
1075: 
1076:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1077:       sum2   -= *v2-- * tmp0;
1078:       x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1079:       break;
1080:     case 3 :
1081:       sum1 = tmp[row];
1082:       sum2 = tmp[row -1];
1083:       sum3 = tmp[row -2];
1084:       v2   = aa + ai[row]-1;
1085:       v3   = aa + ai[row -1]-1;
1086:       for (j=nz ; j>1; j-=2){
1087:         i0   = vi[0];
1088:         i1   = vi[-1];
1089:         vi  -=2;
1090:         tmp0 = tmps[i0];
1091:         tmp1 = tmps[i1];
1092:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1093:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1094:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1095:       }
1096:       if (j==1){
1097:         tmp0  = tmps[*vi--];
1098:         sum1 -= *v1-- * tmp0;
1099:         sum2 -= *v2-- * tmp0;
1100:         sum3 -= *v3-- * tmp0;
1101:       }
1102:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1103:       sum2   -= *v2-- * tmp0;
1104:       sum3   -= *v3-- * tmp0;
1105:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1106:       sum3   -= *v3-- * tmp0;
1107:       x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1108: 
1109:       break;
1110:     case 4 :
1111:       sum1 = tmp[row];
1112:       sum2 = tmp[row -1];
1113:       sum3 = tmp[row -2];
1114:       sum4 = tmp[row -3];
1115:       v2   = aa + ai[row]-1;
1116:       v3   = aa + ai[row -1]-1;
1117:       v4   = aa + ai[row -2]-1;

1119:       for (j=nz ; j>1; j-=2){
1120:         i0   = vi[0];
1121:         i1   = vi[-1];
1122:         vi  -=2;
1123:         tmp0 = tmps[i0];
1124:         tmp1 = tmps[i1];
1125:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1126:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1127:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1128:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1129:       }
1130:       if (j==1){
1131:         tmp0  = tmps[*vi--];
1132:         sum1 -= *v1-- * tmp0;
1133:         sum2 -= *v2-- * tmp0;
1134:         sum3 -= *v3-- * tmp0;
1135:         sum4 -= *v4-- * tmp0;
1136:       }

1138:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1139:       sum2   -= *v2-- * tmp0;
1140:       sum3   -= *v3-- * tmp0;
1141:       sum4   -= *v4-- * tmp0;
1142:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1143:       sum3   -= *v3-- * tmp0;
1144:       sum4   -= *v4-- * tmp0;
1145:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1146:       sum4   -= *v4-- * tmp0;
1147:       x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1148:       break;
1149:     case 5 :
1150:       sum1 = tmp[row];
1151:       sum2 = tmp[row -1];
1152:       sum3 = tmp[row -2];
1153:       sum4 = tmp[row -3];
1154:       sum5 = tmp[row -4];
1155:       v2   = aa + ai[row]-1;
1156:       v3   = aa + ai[row -1]-1;
1157:       v4   = aa + ai[row -2]-1;
1158:       v5   = aa + ai[row -3]-1;
1159:       for (j=nz ; j>1; j-=2){
1160:         i0   = vi[0];
1161:         i1   = vi[-1];
1162:         vi  -=2;
1163:         tmp0 = tmps[i0];
1164:         tmp1 = tmps[i1];
1165:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1166:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1167:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1168:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1169:         sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1170:       }
1171:       if (j==1){
1172:         tmp0  = tmps[*vi--];
1173:         sum1 -= *v1-- * tmp0;
1174:         sum2 -= *v2-- * tmp0;
1175:         sum3 -= *v3-- * tmp0;
1176:         sum4 -= *v4-- * tmp0;
1177:         sum5 -= *v5-- * tmp0;
1178:       }

1180:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1181:       sum2   -= *v2-- * tmp0;
1182:       sum3   -= *v3-- * tmp0;
1183:       sum4   -= *v4-- * tmp0;
1184:       sum5   -= *v5-- * tmp0;
1185:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1186:       sum3   -= *v3-- * tmp0;
1187:       sum4   -= *v4-- * tmp0;
1188:       sum5   -= *v5-- * tmp0;
1189:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1190:       sum4   -= *v4-- * tmp0;
1191:       sum5   -= *v5-- * tmp0;
1192:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1193:       sum5   -= *v5-- * tmp0;
1194:       x[*c--] = tmp[row] = sum5*a_a[ad[row]+shift]; row--;
1195:       break;
1196:     default:
1197:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1198:     }
1199:   }
1200:   ISRestoreIndices(isrow,&rout);
1201:   ISRestoreIndices(iscol,&cout);
1202:   VecRestoreArray(bb,&b);
1203:   VecRestoreArray(xx,&x);
1204:   PetscLogFlops(2*a->nz - A->n);
1205:   return(0);
1206: }


1209: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1210: {
1211:   Mat          C = *B;
1212:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1213:   IS           iscol = b->col,isrow = b->row,isicol = b->icol;
1214:   int          shift = a->indexshift,*r,*ic,*c,ierr,n = A->m,*bi = b->i;
1215:   int          *bj = b->j+shift,*nbj=b->j +(!shift),*ajtmp,*bjtmp,nz,row,prow;
1216:   int          *ics,i,j,idx,*ai = a->i,*aj = a->j+shift,*bd = b->diag,node_max,nsz;
1217:   int          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1218:   PetscScalar  *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1219:   PetscScalar  tmp,*ba = b->a+shift,*aa = a->a+shift,*pv,*rtmps1,*rtmps2,*rtmps3;
1220:   PetscTruth   damp;
1221:   PetscReal    damping = b->lu_damping,zeropivot = b->lu_zeropivot;

1224:   ierr   = ISGetIndices(isrow,&r);
1225:   ierr   = ISGetIndices(iscol,&c);
1226:   ierr   = ISGetIndices(isicol,&ic);
1227:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1228:   ierr   = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1229:   ics    = ic + shift; rtmps1 = rtmp1 + shift;
1230:   rtmp2  = rtmp1 + n;  rtmps2 = rtmp2 + shift;
1231:   rtmp3  = rtmp2 + n;  rtmps3 = rtmp3 + shift;
1232: 
1233:   node_max = a->inode.node_count;
1234:   ns       = a->inode.size ;
1235:   if (!ns){
1236:     SETERRQ(1,"Matrix without inode information");
1237:   }

1239:   /* If max inode size > 3, split it into two inodes.*/
1240:   /* also map the inode sizes according to the ordering */
1241:   PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1242:   for (i=0,j=0; i<node_max; ++i,++j){
1243:     if (ns[i]>3) {
1244:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1245:       ++j;
1246:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247:     } else {
1248:       tmp_vec1[j] = ns[i];
1249:     }
1250:   }
1251:   /* Use the correct node_max */
1252:   node_max = j;

1254:   /* Now reorder the inode info based on mat re-ordering info */
1255:   /* First create a row -> inode_size_array_index map */
1256:   PetscMalloc(n*sizeof(int)+1,&nsmap);
1257:   PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1258:   for (i=0,row=0; i<node_max; i++) {
1259:     nsz = tmp_vec1[i];
1260:     for (j=0; j<nsz; j++,row++) {
1261:       nsmap[row] = i;
1262:     }
1263:   }
1264:   /* Using nsmap, create a reordered ns structure */
1265:   for (i=0,j=0; i< node_max; i++) {
1266:     nsz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1267:     tmp_vec2[i] = nsz;
1268:     j        += nsz;
1269:   }
1270:   PetscFree(nsmap);
1271:   PetscFree(tmp_vec1);
1272:   /* Now use the correct ns */
1273:   ns = tmp_vec2;


1276:   do {
1277:     damp = PETSC_FALSE;
1278:     /* Now loop over each block-row, and do the factorization */
1279:     for (i=0,row=0; i<node_max; i++) {
1280:       nsz   = ns[i];
1281:       nz    = bi[row+1] - bi[row];
1282:       bjtmp = bj + bi[row];
1283: 
1284:       switch (nsz){
1285:       case 1:
1286:         for  (j=0; j<nz; j++){
1287:           idx         = bjtmp[j];
1288:           rtmps1[idx] = 0.0;
1289:         }
1290: 
1291:         /* load in initial (unfactored row) */
1292:         idx   = r[row];
1293:         nz    = ai[idx+1] - ai[idx];
1294:         ajtmp = aj + ai[idx];
1295:         v1    = aa + ai[idx];

1297:         for (j=0; j<nz; j++) {
1298:           idx        = ics[ajtmp[j]];
1299:           rtmp1[idx] = v1[j];
1300:           if (ajtmp[j] == r[row]) {
1301:             rtmp1[idx] += damping;
1302:           }
1303:         }
1304:         prow = *bjtmp++ + shift;
1305:         while (prow < row) {
1306:           pc1 = rtmp1 + prow;
1307:           if (*pc1 != 0.0){
1308:             pv   = ba + bd[prow];
1309:             pj   = nbj + bd[prow];
1310:             mul1 = *pc1 * *pv++;
1311:             *pc1 = mul1;
1312:             nz   = bi[prow+1] - bd[prow] - 1;
1313:             PetscLogFlops(2*nz);
1314:             for (j=0; j<nz; j++) {
1315:               tmp = pv[j];
1316:               idx = pj[j];
1317:               rtmps1[idx] -= mul1 * tmp;
1318:             }
1319:           }
1320:           prow = *bjtmp++ + shift;
1321:         }
1322:         nz  = bi[row+1] - bi[row];
1323:         pj  = bj + bi[row];
1324:         pc1 = ba + bi[row];
1325:         if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1326:           if (b->lu_damping) {
1327:             damp = PETSC_TRUE;
1328:             if (damping) damping *= 2.0;
1329:             else damping = 1.e-12;
1330:             ndamp++;
1331:             goto endofwhile;
1332:           } else {
1333:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1334:           }
1335:         }
1336:         rtmp1[row] = 1.0/rtmp1[row];
1337:         for (j=0; j<nz; j++) {
1338:           idx    = pj[j];
1339:           pc1[j] = rtmps1[idx];
1340:         }
1341:         break;
1342: 
1343:       case 2:
1344:         for  (j=0; j<nz; j++) {
1345:           idx         = bjtmp[j];
1346:           rtmps1[idx] = 0.0;
1347:           rtmps2[idx] = 0.0;
1348:         }
1349: 
1350:         /* load in initial (unfactored row) */
1351:         idx   = r[row];
1352:         nz    = ai[idx+1] - ai[idx];
1353:         ajtmp = aj + ai[idx];
1354:         v1    = aa + ai[idx];
1355:         v2    = aa + ai[idx+1];
1356: 
1357:         for (j=0; j<nz; j++) {
1358:           idx        = ics[ajtmp[j]];
1359:           rtmp1[idx] = v1[j];
1360:           rtmp2[idx] = v2[j];
1361:           if (ajtmp[j] == r[row]) {
1362:             rtmp1[idx] += damping;
1363:           }
1364:           if (ajtmp[j] == r[row+1]) {
1365:             rtmp2[idx] += damping;
1366:           }
1367:         }
1368:         prow = *bjtmp++ + shift;
1369:         while (prow < row) {
1370:           pc1 = rtmp1 + prow;
1371:           pc2 = rtmp2 + prow;
1372:           if (*pc1 != 0.0 || *pc2 != 0.0){
1373:             pv   = ba + bd[prow];
1374:             pj   = nbj + bd[prow];
1375:             mul1 = *pc1 * *pv;
1376:             mul2 = *pc2 * *pv;
1377:             ++pv;
1378:             *pc1 = mul1;
1379:             *pc2 = mul2;
1380: 
1381:             nz   = bi[prow+1] - bd[prow] - 1;
1382:             PetscLogFlops(2*2*nz);
1383:             for (j=0; j<nz; j++) {
1384:               tmp = pv[j];
1385:               idx = pj[j];
1386:               rtmps1[idx] -= mul1 * tmp;
1387:               rtmps2[idx] -= mul2 * tmp;
1388:             }
1389:           }
1390:           prow = *bjtmp++ + shift;
1391:         }
1392:         /* Now take care of the odd element*/
1393:         pc1 = rtmp1 + prow;
1394:         pc2 = rtmp2 + prow;
1395:         if (*pc2 != 0.0){
1396:           pj   = nbj + bd[prow];
1397:           if (PetscAbsScalar(*pc1) < zeropivot) {
1398:             if (b->lu_damping) {
1399:               damp = PETSC_TRUE;
1400:               if (damping) damping *= 2.0;
1401:               else damping = 1.e-12;
1402:               ndamp++;
1403:               goto endofwhile;
1404:             } else {
1405:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1406:             }
1407:           }
1408:           mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1409:           *pc2 = mul2;
1410:           nz   = bi[prow+1] - bd[prow] - 1;
1411:           PetscLogFlops(2*nz);
1412:           for (j=0; j<nz; j++) {
1413:             idx = pj[j] + shift;
1414:             tmp = rtmp1[idx];
1415:             rtmp2[idx] -= mul2 * tmp;
1416:           }
1417:         }
1418: 
1419:         nz  = bi[row+1] - bi[row];
1420:         pj  = bj + bi[row];
1421:         pc1 = ba + bi[row];
1422:         pc2 = ba + bi[row+1];
1423:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1424:           if (b->lu_damping) {
1425:             damp = PETSC_TRUE;
1426:             if (damping) damping *= 2.0;
1427:             else damping = 1.e-12;
1428:             ndamp++;
1429:             goto endofwhile;
1430:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1431:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1432:           } else {
1433:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1434:           }
1435:         }
1436:         rtmp1[row]   = 1.0/rtmp1[row];
1437:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1438:         for (j=0; j<nz; j++) {
1439:           idx    = pj[j];
1440:           pc1[j] = rtmps1[idx];
1441:           pc2[j] = rtmps2[idx];
1442:         }
1443:         break;

1445:       case 3:
1446:         for  (j=0; j<nz; j++) {
1447:           idx         = bjtmp[j];
1448:           rtmps1[idx] = 0.0;
1449:           rtmps2[idx] = 0.0;
1450:           rtmps3[idx] = 0.0;
1451:         }
1452:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1453:         idx   = r[row];
1454:         nz    = ai[idx+1] - ai[idx];
1455:         ajtmp = aj + ai[idx];
1456:         v1    = aa + ai[idx];
1457:         v2    = aa + ai[idx+1];
1458:         v3    = aa + ai[idx+2];
1459:         for (j=0; j<nz; j++) {
1460:           idx        = ics[ajtmp[j]];
1461:           rtmp1[idx] = v1[j];
1462:           rtmp2[idx] = v2[j];
1463:           rtmp3[idx] = v3[j];
1464:           if (ajtmp[j] == r[row]) {
1465:             rtmp1[idx] += damping;
1466:           }
1467:           if (ajtmp[j] == r[row+1]) {
1468:             rtmp2[idx] += damping;
1469:           }
1470:           if (ajtmp[j] == r[row+2]) {
1471:             rtmp3[idx] += damping;
1472:           }
1473:         }
1474:         /* loop over all pivot row blocks above this row block */
1475:         prow = *bjtmp++ + shift;
1476:         while (prow < row) {
1477:           pc1 = rtmp1 + prow;
1478:           pc2 = rtmp2 + prow;
1479:           pc3 = rtmp3 + prow;
1480:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1481:             pv   = ba  + bd[prow];
1482:             pj   = nbj + bd[prow];
1483:             mul1 = *pc1 * *pv;
1484:             mul2 = *pc2 * *pv;
1485:             mul3 = *pc3 * *pv;
1486:             ++pv;
1487:             *pc1 = mul1;
1488:             *pc2 = mul2;
1489:             *pc3 = mul3;
1490: 
1491:             nz   = bi[prow+1] - bd[prow] - 1;
1492:             PetscLogFlops(3*2*nz);
1493:             /* update this row based on pivot row */
1494:             for (j=0; j<nz; j++) {
1495:               tmp = pv[j];
1496:               idx = pj[j];
1497:               rtmps1[idx] -= mul1 * tmp;
1498:               rtmps2[idx] -= mul2 * tmp;
1499:               rtmps3[idx] -= mul3 * tmp;
1500:             }
1501:           }
1502:           prow = *bjtmp++ + shift;
1503:         }
1504:         /* Now take care of diagonal block in this set of rows */
1505:         pc1 = rtmp1 + prow;
1506:         pc2 = rtmp2 + prow;
1507:         pc3 = rtmp3 + prow;
1508:         if (*pc2 != 0.0 || *pc3 != 0.0){
1509:           pj   = nbj + bd[prow];
1510:           if (PetscAbsScalar(*pc1) < zeropivot) {
1511:             if (b->lu_damping) {
1512:               damp = PETSC_TRUE;
1513:               if (damping) damping *= 2.0;
1514:               else damping = 1.e-12;
1515:               ndamp++;
1516:               goto endofwhile;
1517:             } else {
1518:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1519:             }
1520:           }
1521:           mul2 = (*pc2)/(*pc1);
1522:           mul3 = (*pc3)/(*pc1);
1523:           *pc2 = mul2;
1524:           *pc3 = mul3;
1525:           nz   = bi[prow+1] - bd[prow] - 1;
1526:           PetscLogFlops(2*2*nz);
1527:           for (j=0; j<nz; j++) {
1528:             idx = pj[j] + shift;
1529:             tmp = rtmp1[idx];
1530:             rtmp2[idx] -= mul2 * tmp;
1531:             rtmp3[idx] -= mul3 * tmp;
1532:           }
1533:         }
1534:         ++prow;
1535:         pc2 = rtmp2 + prow;
1536:         pc3 = rtmp3 + prow;
1537:         if (*pc3 != 0.0){
1538:           pj   = nbj + bd[prow];
1539:           pj   = nbj + bd[prow];
1540:           if (PetscAbsScalar(*pc2) < zeropivot) {
1541:             if (b->lu_damping) {
1542:               damp = PETSC_TRUE;
1543:               if (damping) damping *= 2.0;
1544:               else damping = 1.e-12;
1545:               ndamp++;
1546:               goto endofwhile;
1547:             } else {
1548:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1549:             }
1550:           }
1551:           mul3 = (*pc3)/(*pc2);
1552:           *pc3 = mul3;
1553:           nz   = bi[prow+1] - bd[prow] - 1;
1554:           PetscLogFlops(2*2*nz);
1555:           for (j=0; j<nz; j++) {
1556:             idx = pj[j] + shift;
1557:             tmp = rtmp2[idx];
1558:             rtmp3[idx] -= mul3 * tmp;
1559:           }
1560:         }
1561:         nz  = bi[row+1] - bi[row];
1562:         pj  = bj + bi[row];
1563:         pc1 = ba + bi[row];
1564:         pc2 = ba + bi[row+1];
1565:         pc3 = ba + bi[row+2];
1566:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1567:           if (b->lu_damping) {
1568:             damp = PETSC_TRUE;
1569:             if (damping) damping *= 2.0;
1570:             else damping = 1.e-12;
1571:             ndamp++;
1572:             goto endofwhile;
1573:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1574:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1575:           } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1576:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1577:         } else {
1578:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1579:           }
1580:         }
1581:         rtmp1[row]   = 1.0/rtmp1[row];
1582:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1583:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1584:         /* copy row entries from dense representation to sparse */
1585:         for (j=0; j<nz; j++) {
1586:           idx    = pj[j];
1587:           pc1[j] = rtmps1[idx];
1588:           pc2[j] = rtmps2[idx];
1589:           pc3[j] = rtmps3[idx];
1590:         }
1591:         break;

1593:       default:
1594:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1595:       }
1596:       row += nsz;                 /* Update the row */
1597:     }
1598:     endofwhile:;
1599:   } while (damp);
1600:   PetscFree(rtmp1);
1601:   PetscFree(tmp_vec2);
1602:   ISRestoreIndices(isicol,&ic);
1603:   ISRestoreIndices(isrow,&r);
1604:   ISRestoreIndices(iscol,&c);
1605:   C->factor      = FACTOR_LU;
1606:   C->assembled   = PETSC_TRUE;
1607:   if (ndamp || b->lu_damping) {
1608:     PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %gn",ndamp,damping);
1609:   }
1610:   PetscLogFlops(C->n);
1611:   return(0);
1612: }

1614: /*
1615:      This is really ugly. if inodes are used this replaces the 
1616:   permutations with ones that correspond to rows/cols of the matrix
1617:   rather then inode blocks
1618: */
1619: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1620: {
1621:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1622:   int        ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1623:   int        row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1624:   int        nslim_col,*ns_col;
1625:   IS         ris = *rperm,cis = *cperm;
1626:   PetscTruth flg;

1629:   PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1630:   if (!flg) return(0);

1632:   if (!a->inode.size) return(0); /* no inodes so return */
1633:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1635:   ierr  = Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1636:   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1637:   ierr  = PetscMalloc((m+n+1)*sizeof(int),&permr);
1638:   permc = permr + m;

1640:   ierr  = ISGetIndices(ris,&ridx);
1641:   ierr  = ISGetIndices(cis,&cidx);

1643:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1644:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1646:   /* Construct the permutations for rows*/
1647:   for (i=0,row = 0; i<nslim_row; ++i){
1648:     indx      = ridx[i];
1649:     start_val = tns[indx];
1650:     end_val   = tns[indx + 1];
1651:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1652:   }

1654:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1655:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1657:  /*Construct permutations for columns*/
1658:   for (i=0,col=0; i<nslim_col; ++i){
1659:     indx      = cidx[i];
1660:     start_val = tns[indx];
1661:     end_val   = tns[indx + 1];
1662:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1663:   }

1665:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1666:   ISSetPermutation(*rperm);
1667:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1668:   ISSetPermutation(*cperm);
1669: 
1670:   ierr  = ISRestoreIndices(ris,&ridx);
1671:   ierr  = ISRestoreIndices(cis,&cidx);

1673:   PetscFree(ns_col);
1674:   PetscFree(permr);
1675:   ISDestroy(cis);
1676:   ISDestroy(ris);
1677:   PetscFree(tns);
1678:   return(0);
1679: }



1683: /*@C
1684:    MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.

1686:    Collective on Mat

1688:    Input Parameter:
1689: .  A - the SeqAIJ matrix.

1691:    Output Parameter:
1692: +  node_count - no of inodes present in the matrix.
1693: .  sizes      - an array of size node_count,with sizes of each inode.
1694: -  limit      - the max size used to generate the inodes.

1696:    Level: advanced

1698:    Notes: This routine returns some internal storage information
1699:    of the matrix, it is intended to be used by advanced users.
1700:    It should be called after the matrix is assembled.
1701:    The contents of the sizes[] array should not be changed.

1703: .keywords: matrix, seqaij, get, inode

1705: .seealso: MatGetInfo()
1706: @*/
1707: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1708: {
1709:   Mat_SeqAIJ *a;
1710:   PetscTruth flg;
1711:   int        ierr;

1715:   PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1716:   if (!flg) SETERRQ(PETSC_ERR_ARG_WRONG,"MatSeqAIJ only");
1717:   a           = (Mat_SeqAIJ*)A->data;
1718:   *node_count = a->inode.node_count;
1719:   *sizes      = a->inode.size;
1720:   *limit      = a->inode.limit;
1721:   return(0);
1722: }