Actual source code: aijnode.c
1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
2: /*
3: This file provides high performance routines for the AIJ (compressed row)
4: format by taking advantage of rows with identical nonzero structure (I-nodes).
5: */
6: #include src/mat/impls/aij/seq/aij.h
7: #include src/vec/vecimpl.h
9: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
10: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
11: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);
13: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
14: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
15: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
16: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
17: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
18: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
19: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
20: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
22: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
23: {
24: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
25: int ierr,i,count,m,n,min_mn,*ns_row,*ns_col;
28: n = A->n;
29: m = A->m;
30: ns_row = a->inode.size;
31:
32: min_mn = (m < n) ? m : n;
33: if (!ns) {
34: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
35: for(; count+1 < n; count++,i++);
36: if (count < n) {
37: i++;
38: }
39: *size = i;
40: return(0);
41: }
42: PetscMalloc((n+1)*sizeof(int),&ns_col);
43:
44: /* Use the same row structure wherever feasible. */
45: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
46: ns_col[i] = ns_row[i];
47: }
49: /* if m < n; pad up the remainder with inode_limit */
50: for(; count+1 < n; count++,i++) {
51: ns_col[i] = 1;
52: }
53: /* The last node is the odd ball. padd it up with the remaining rows; */
54: if (count < n) {
55: ns_col[i] = n - count;
56: i++;
57: } else if (count > n) {
58: /* Adjust for the over estimation */
59: ns_col[i-1] += n - count;
60: }
61: *size = i;
62: *ns = ns_col;
63: return(0);
64: }
67: /*
68: This builds symmetric version of nonzero structure,
69: */
70: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
71: {
72: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
73: int *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
74: int *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
77: nslim_row = a->inode.node_count;
78: m = A->m;
79: n = A->n;
80: if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
81:
82: /* Use the row_inode as column_inode */
83: nslim_col = nslim_row;
84: ns_col = ns_row;
86: /* allocate space for reformated inode structure */
87: PetscMalloc((nslim_col+1)*sizeof(int),&tns);
88: PetscMalloc((n+1)*sizeof(int),&tvc);
89: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
91: for (i1=0,col=0; i1<nslim_col; ++i1){
92: nsz = ns_col[i1];
93: for (i2=0; i2<nsz; ++i2,++col)
94: tvc[col] = i1;
95: }
96: /* allocate space for row pointers */
97: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
98: *iia = ia;
99: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
100: PetscMalloc((nslim_row+1)*sizeof(int),&work);
102: /* determine the number of columns in each row */
103: ia[0] = oshift;
104: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
106: j = aj + ai[row] + ishift;
107: jmax = aj + ai[row+1] + ishift;
108: i2 = 0;
109: col = *j++ + ishift;
110: i2 = tvc[col];
111: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
112: ia[i1+1]++;
113: ia[i2+1]++;
114: i2++; /* Start col of next node */
115: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
116: i2 = tvc[col];
117: }
118: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
119: }
121: /* shift ia[i] to point to next row */
122: for (i1=1; i1<nslim_row+1; i1++) {
123: row = ia[i1-1];
124: ia[i1] += row;
125: work[i1-1] = row - oshift;
126: }
128: /* allocate space for column pointers */
129: nz = ia[nslim_row] + (!ishift);
130: PetscMalloc(nz*sizeof(int),&ja);
131: *jja = ja;
133: /* loop over lower triangular part putting into ja */
134: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
135: j = aj + ai[row] + ishift;
136: jmax = aj + ai[row+1] + ishift;
137: i2 = 0; /* Col inode index */
138: col = *j++ + ishift;
139: i2 = tvc[col];
140: while (i2<i1 && j<jmax) {
141: ja[work[i2]++] = i1 + oshift;
142: ja[work[i1]++] = i2 + oshift;
143: ++i2;
144: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
145: i2 = tvc[col];
146: }
147: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
149: }
150: PetscFree(work);
151: PetscFree(tns);
152: PetscFree(tvc);
153: return(0);
154: }
156: /*
157: This builds nonsymmetric version of nonzero structure,
158: */
159: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
160: {
161: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
162: int *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
163: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
166: nslim_row = a->inode.node_count;
167: n = A->n;
169: /* Create The column_inode for this matrix */
170: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
171:
172: /* allocate space for reformated column_inode structure */
173: PetscMalloc((nslim_col +1)*sizeof(int),&tns);
174: PetscMalloc((n +1)*sizeof(int),&tvc);
175: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
177: for (i1=0,col=0; i1<nslim_col; ++i1){
178: nsz = ns_col[i1];
179: for (i2=0; i2<nsz; ++i2,++col)
180: tvc[col] = i1;
181: }
182: /* allocate space for row pointers */
183: PetscMalloc((nslim_row+1)*sizeof(int),&ia);
184: *iia = ia;
185: PetscMemzero(ia,(nslim_row+1)*sizeof(int));
186: PetscMalloc((nslim_row+1)*sizeof(int),&work);
188: /* determine the number of columns in each row */
189: ia[0] = oshift;
190: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
191: j = aj + ai[row] + ishift;
192: col = *j++ + ishift;
193: i2 = tvc[col];
194: nz = ai[row+1] - ai[row];
195: while (nz-- > 0) { /* off-diagonal elemets */
196: ia[i1+1]++;
197: i2++; /* Start col of next node */
198: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
199: i2 = tvc[col];
200: }
201: }
203: /* shift ia[i] to point to next row */
204: for (i1=1; i1<nslim_row+1; i1++) {
205: row = ia[i1-1];
206: ia[i1] += row;
207: work[i1-1] = row - oshift;
208: }
210: /* allocate space for column pointers */
211: nz = ia[nslim_row] + (!ishift);
212: PetscMalloc(nz*sizeof(int),&ja);
213: *jja = ja;
215: /* loop over matrix putting into ja */
216: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
217: j = aj + ai[row] + ishift;
218: i2 = 0; /* Col inode index */
219: col = *j++ + ishift;
220: i2 = tvc[col];
221: nz = ai[row+1] - ai[row];
222: while (nz-- > 0) {
223: ja[work[i1]++] = i2 + oshift;
224: ++i2;
225: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
226: i2 = tvc[col];
227: }
228: }
229: PetscFree(ns_col);
230: PetscFree(work);
231: PetscFree(tns);
232: PetscFree(tvc);
233: return(0);
234: }
236: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
237: {
238: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
239: int ierr,ishift;
242: *n = a->inode.node_count;
243: if (!ia) return(0);
245: ishift = a->indexshift;
246: if (symmetric) {
247: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
248: } else {
249: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
250: }
251: return(0);
252: }
254: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
255: {
259: if (!ia) return(0);
260: PetscFree(*ia);
261: PetscFree(*ja);
262: return(0);
263: }
265: /* ----------------------------------------------------------- */
267: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
268: {
269: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
270: int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
271: int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
274: nslim_row = a->inode.node_count;
275: n = A->n;
277: /* Create The column_inode for this matrix */
278: Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
279:
280: /* allocate space for reformated column_inode structure */
281: PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
282: PetscMalloc((n + 1)*sizeof(int),&tvc);
283: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
285: for (i1=0,col=0; i1<nslim_col; ++i1){
286: nsz = ns_col[i1];
287: for (i2=0; i2<nsz; ++i2,++col)
288: tvc[col] = i1;
289: }
290: /* allocate space for column pointers */
291: PetscMalloc((nslim_col+1)*sizeof(int),&ia);
292: *iia = ia;
293: PetscMemzero(ia,(nslim_col+1)*sizeof(int));
294: PetscMalloc((nslim_col+1)*sizeof(int),&work);
296: /* determine the number of columns in each row */
297: ia[0] = oshift;
298: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
299: j = aj + ai[row] + ishift;
300: col = *j++ + ishift;
301: i2 = tvc[col];
302: nz = ai[row+1] - ai[row];
303: while (nz-- > 0) { /* off-diagonal elemets */
304: /* ia[i1+1]++; */
305: ia[i2+1]++;
306: i2++;
307: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
308: i2 = tvc[col];
309: }
310: }
312: /* shift ia[i] to point to next col */
313: for (i1=1; i1<nslim_col+1; i1++) {
314: col = ia[i1-1];
315: ia[i1] += col;
316: work[i1-1] = col - oshift;
317: }
319: /* allocate space for column pointers */
320: nz = ia[nslim_col] + (!ishift);
321: PetscMalloc(nz*sizeof(int),&ja);
322: *jja = ja;
324: /* loop over matrix putting into ja */
325: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
326: j = aj + ai[row] + ishift;
327: i2 = 0; /* Col inode index */
328: col = *j++ + ishift;
329: i2 = tvc[col];
330: nz = ai[row+1] - ai[row];
331: while (nz-- > 0) {
332: /* ja[work[i1]++] = i2 + oshift; */
333: ja[work[i2]++] = i1 + oshift;
334: i2++;
335: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
336: i2 = tvc[col];
337: }
338: }
339: PetscFree(ns_col);
340: PetscFree(work);
341: PetscFree(tns);
342: PetscFree(tvc);
343: return(0);
344: }
346: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
347: {
348: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
349: int ierr,ishift;
352: Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
353: if (!ia) return(0);
355: ishift = a->indexshift;
356: if (symmetric) {
357: /* Since the indices are symmetric it does'nt matter */
358: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
359: } else {
360: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
361: }
362: return(0);
363: }
365: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
366: {
370: if (!ia) return(0);
371: PetscFree(*ia);
372: PetscFree(*ja);
373: return(0);
374: }
376: /* ----------------------------------------------------------- */
378: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
379: {
380: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
381: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
382: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
383: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
384: int shift = a->indexshift;
385:
386: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
387: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
388: #endif
391: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
392: node_max = a->inode.node_count;
393: ns = a->inode.size; /* Node Size array */
394: VecGetArray(xx,&x);
395: VecGetArray(yy,&y);
396: x = x + shift; /* shift for Fortran start by 1 indexing */
397: idx = a->j;
398: v1 = a->a;
399: ii = a->i;
401: for (i = 0,row = 0; i< node_max; ++i){
402: nsz = ns[i];
403: n = ii[1] - ii[0];
404: ii += nsz;
405: sz = n; /*No of non zeros in this row */
406: /* Switch on the size of Node */
407: switch (nsz){ /* Each loop in 'case' is unrolled */
408: case 1 :
409: sum1 = 0;
410:
411: for(n = 0; n< sz-1; n+=2) {
412: i1 = idx[0]; /* The instructions are ordered to */
413: i2 = idx[1]; /* make the compiler's job easy */
414: idx += 2;
415: tmp0 = x[i1];
416: tmp1 = x[i2];
417: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
418: }
419:
420: if (n == sz-1){ /* Take care of the last nonzero */
421: tmp0 = x[*idx++];
422: sum1 += *v1++ * tmp0;
423: }
424: y[row++]=sum1;
425: break;
426: case 2:
427: sum1 = 0;
428: sum2 = 0;
429: v2 = v1 + n;
430:
431: for (n = 0; n< sz-1; n+=2) {
432: i1 = idx[0];
433: i2 = idx[1];
434: idx += 2;
435: tmp0 = x[i1];
436: tmp1 = x[i2];
437: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
438: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
439: }
440: if (n == sz-1){
441: tmp0 = x[*idx++];
442: sum1 += *v1++ * tmp0;
443: sum2 += *v2++ * tmp0;
444: }
445: y[row++]=sum1;
446: y[row++]=sum2;
447: v1 =v2; /* Since the next block to be processed starts there*/
448: idx +=sz;
449: break;
450: case 3:
451: sum1 = 0;
452: sum2 = 0;
453: sum3 = 0;
454: v2 = v1 + n;
455: v3 = v2 + n;
456:
457: for (n = 0; n< sz-1; n+=2) {
458: i1 = idx[0];
459: i2 = idx[1];
460: idx += 2;
461: tmp0 = x[i1];
462: tmp1 = x[i2];
463: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
464: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
465: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
466: }
467: if (n == sz-1){
468: tmp0 = x[*idx++];
469: sum1 += *v1++ * tmp0;
470: sum2 += *v2++ * tmp0;
471: sum3 += *v3++ * tmp0;
472: }
473: y[row++]=sum1;
474: y[row++]=sum2;
475: y[row++]=sum3;
476: v1 =v3; /* Since the next block to be processed starts there*/
477: idx +=2*sz;
478: break;
479: case 4:
480: sum1 = 0;
481: sum2 = 0;
482: sum3 = 0;
483: sum4 = 0;
484: v2 = v1 + n;
485: v3 = v2 + n;
486: v4 = v3 + n;
487:
488: for (n = 0; n< sz-1; n+=2) {
489: i1 = idx[0];
490: i2 = idx[1];
491: idx += 2;
492: tmp0 = x[i1];
493: tmp1 = x[i2];
494: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
495: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
496: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
497: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
498: }
499: if (n == sz-1){
500: tmp0 = x[*idx++];
501: sum1 += *v1++ * tmp0;
502: sum2 += *v2++ * tmp0;
503: sum3 += *v3++ * tmp0;
504: sum4 += *v4++ * tmp0;
505: }
506: y[row++]=sum1;
507: y[row++]=sum2;
508: y[row++]=sum3;
509: y[row++]=sum4;
510: v1 =v4; /* Since the next block to be processed starts there*/
511: idx +=3*sz;
512: break;
513: case 5:
514: sum1 = 0;
515: sum2 = 0;
516: sum3 = 0;
517: sum4 = 0;
518: sum5 = 0;
519: v2 = v1 + n;
520: v3 = v2 + n;
521: v4 = v3 + n;
522: v5 = v4 + n;
523:
524: for (n = 0; n<sz-1; n+=2) {
525: i1 = idx[0];
526: i2 = idx[1];
527: idx += 2;
528: tmp0 = x[i1];
529: tmp1 = x[i2];
530: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
531: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
532: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
533: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
534: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
535: }
536: if (n == sz-1){
537: tmp0 = x[*idx++];
538: sum1 += *v1++ * tmp0;
539: sum2 += *v2++ * tmp0;
540: sum3 += *v3++ * tmp0;
541: sum4 += *v4++ * tmp0;
542: sum5 += *v5++ * tmp0;
543: }
544: y[row++]=sum1;
545: y[row++]=sum2;
546: y[row++]=sum3;
547: y[row++]=sum4;
548: y[row++]=sum5;
549: v1 =v5; /* Since the next block to be processed starts there */
550: idx +=4*sz;
551: break;
552: default :
553: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
554: }
555: }
556: VecRestoreArray(xx,&x);
557: VecRestoreArray(yy,&y);
558: PetscLogFlops(2*a->nz - A->m);
559: return(0);
560: }
561: /* ----------------------------------------------------------- */
562: /* Almost same code as the MatMult_SeqAij_Inode() */
563: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
564: {
565: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
566: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
567: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
568: int ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
569: int shift = a->indexshift;
570:
572: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
573: node_max = a->inode.node_count;
574: ns = a->inode.size; /* Node Size array */
575: VecGetArray(xx,&x);
576: VecGetArray(yy,&y);
577: if (zz != yy) {
578: VecGetArray(zz,&z);
579: } else {
580: z = y;
581: }
582: zt = z;
584: x = x + shift; /* shift for Fortran start by 1 indexing */
585: idx = a->j;
586: v1 = a->a;
587: ii = a->i;
589: for (i = 0,row = 0; i< node_max; ++i){
590: nsz = ns[i];
591: n = ii[1] - ii[0];
592: ii += nsz;
593: sz = n; /*No of non zeros in this row */
594: /* Switch on the size of Node */
595: switch (nsz){ /* Each loop in 'case' is unrolled */
596: case 1 :
597: sum1 = *zt++;
598:
599: for(n = 0; n< sz-1; n+=2) {
600: i1 = idx[0]; /* The instructions are ordered to */
601: i2 = idx[1]; /* make the compiler's job easy */
602: idx += 2;
603: tmp0 = x[i1];
604: tmp1 = x[i2];
605: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
606: }
607:
608: if(n == sz-1){ /* Take care of the last nonzero */
609: tmp0 = x[*idx++];
610: sum1 += *v1++ * tmp0;
611: }
612: y[row++]=sum1;
613: break;
614: case 2:
615: sum1 = *zt++;
616: sum2 = *zt++;
617: v2 = v1 + n;
618:
619: for(n = 0; n< sz-1; n+=2) {
620: i1 = idx[0];
621: i2 = idx[1];
622: idx += 2;
623: tmp0 = x[i1];
624: tmp1 = x[i2];
625: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
626: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
627: }
628: if(n == sz-1){
629: tmp0 = x[*idx++];
630: sum1 += *v1++ * tmp0;
631: sum2 += *v2++ * tmp0;
632: }
633: y[row++]=sum1;
634: y[row++]=sum2;
635: v1 =v2; /* Since the next block to be processed starts there*/
636: idx +=sz;
637: break;
638: case 3:
639: sum1 = *zt++;
640: sum2 = *zt++;
641: sum3 = *zt++;
642: v2 = v1 + n;
643: v3 = v2 + n;
644:
645: for (n = 0; n< sz-1; n+=2) {
646: i1 = idx[0];
647: i2 = idx[1];
648: idx += 2;
649: tmp0 = x[i1];
650: tmp1 = x[i2];
651: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
652: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
653: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
654: }
655: if (n == sz-1){
656: tmp0 = x[*idx++];
657: sum1 += *v1++ * tmp0;
658: sum2 += *v2++ * tmp0;
659: sum3 += *v3++ * tmp0;
660: }
661: y[row++]=sum1;
662: y[row++]=sum2;
663: y[row++]=sum3;
664: v1 =v3; /* Since the next block to be processed starts there*/
665: idx +=2*sz;
666: break;
667: case 4:
668: sum1 = *zt++;
669: sum2 = *zt++;
670: sum3 = *zt++;
671: sum4 = *zt++;
672: v2 = v1 + n;
673: v3 = v2 + n;
674: v4 = v3 + n;
675:
676: for (n = 0; n< sz-1; n+=2) {
677: i1 = idx[0];
678: i2 = idx[1];
679: idx += 2;
680: tmp0 = x[i1];
681: tmp1 = x[i2];
682: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
683: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
684: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
685: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
686: }
687: if (n == sz-1){
688: tmp0 = x[*idx++];
689: sum1 += *v1++ * tmp0;
690: sum2 += *v2++ * tmp0;
691: sum3 += *v3++ * tmp0;
692: sum4 += *v4++ * tmp0;
693: }
694: y[row++]=sum1;
695: y[row++]=sum2;
696: y[row++]=sum3;
697: y[row++]=sum4;
698: v1 =v4; /* Since the next block to be processed starts there*/
699: idx +=3*sz;
700: break;
701: case 5:
702: sum1 = *zt++;
703: sum2 = *zt++;
704: sum3 = *zt++;
705: sum4 = *zt++;
706: sum5 = *zt++;
707: v2 = v1 + n;
708: v3 = v2 + n;
709: v4 = v3 + n;
710: v5 = v4 + n;
711:
712: for (n = 0; n<sz-1; n+=2) {
713: i1 = idx[0];
714: i2 = idx[1];
715: idx += 2;
716: tmp0 = x[i1];
717: tmp1 = x[i2];
718: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
719: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
720: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
721: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
722: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
723: }
724: if(n == sz-1){
725: tmp0 = x[*idx++];
726: sum1 += *v1++ * tmp0;
727: sum2 += *v2++ * tmp0;
728: sum3 += *v3++ * tmp0;
729: sum4 += *v4++ * tmp0;
730: sum5 += *v5++ * tmp0;
731: }
732: y[row++]=sum1;
733: y[row++]=sum2;
734: y[row++]=sum3;
735: y[row++]=sum4;
736: y[row++]=sum5;
737: v1 =v5; /* Since the next block to be processed starts there */
738: idx +=4*sz;
739: break;
740: default :
741: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
742: }
743: }
744: VecRestoreArray(xx,&x);
745: VecRestoreArray(yy,&y);
746: if (zz != yy) {
747: VecRestoreArray(zz,&z);
748: }
749: PetscLogFlops(2*a->nz);
750: return(0);
751: }
752: /* ----------------------------------------------------------- */
753: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,int *,ISColoring *);
755: /*
756: samestructure indicates that the matrix has not changed its nonzero structure so we
757: do not need to recompute the inodes
758: */
759: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
760: {
761: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
762: int ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
763: PetscTruth flag,flg;
766: if (samestructure && a->inode.checked) return(0);
768: a->inode.checked = PETSC_TRUE;
770: /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
771: if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODESn"); return(0);}
772: PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
773: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inoden");return(0);}
774: PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
775: if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unrolln");return(0);}
776: PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
777: if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
778: m = A->m;
779: if (a->inode.size) {ns = a->inode.size;}
780: else {PetscMalloc((m+1)*sizeof(int),&ns);}
782: i = 0;
783: node_count = 0;
784: idx = a->j;
785: ii = a->i;
786: while (i < m){ /* For each row */
787: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
788: /* Limits the number of elements in a node to 'a->inode.limit' */
789: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
790: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
791: if (nzy != nzx) break;
792: idy += nzx; /* Same nonzero pattern */
793: PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
794: if (flag == PETSC_FALSE) break;
795: }
796: ns[node_count++] = blk_size;
797: idx += blk_size*nzx;
798: i = j;
799: }
800: /* If not enough inodes found,, do not use inode version of the routines */
801: if (!a->inode.size && m && node_count > 0.9*m) {
802: PetscFree(ns);
803: A->ops->mult = MatMult_SeqAIJ;
804: A->ops->multadd = MatMultAdd_SeqAIJ;
805: A->ops->solve = MatSolve_SeqAIJ;
806: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
807: A->ops->getrowij = MatGetRowIJ_SeqAIJ;
808: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
809: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
810: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
811: A->ops->coloringpatch = 0;
812: a->inode.node_count = 0;
813: a->inode.size = 0;
814: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routinesn",node_count,m);
815: } else {
816: A->ops->mult = MatMult_SeqAIJ_Inode;
817: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
818: A->ops->solve = MatSolve_SeqAIJ_Inode;
819: A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
820: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
821: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
822: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
823: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
824: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
825: a->inode.node_count = node_count;
826: a->inode.size = ns;
827: PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routinesn",node_count,m,a->inode.limit);
828: }
829: return(0);
830: }
832: /* ----------------------------------------------------------- */
833: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
834: {
835: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
836: IS iscol = a->col,isrow = a->row;
837: int *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,shift = a->indexshift,*a_j = a->j;
838: int node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
839: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
840: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
843: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
844: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
845: node_max = a->inode.node_count;
846: ns = a->inode.size; /* Node Size array */
848: VecGetArray(bb,&b);
849: VecGetArray(xx,&x);
850: tmp = a->solve_work;
851:
852: ISGetIndices(isrow,&rout); r = rout;
853: ISGetIndices(iscol,&cout); c = cout + (n-1);
854:
855: /* forward solve the lower triangular */
856: tmps = tmp + shift;
857: aa = a_a +shift;
858: aj = a_j + shift;
859: ad = a->diag;
861: for (i = 0,row = 0; i< node_max; ++i){
862: nsz = ns[i];
863: aii = ai[row];
864: v1 = aa + aii;
865: vi = aj + aii;
866: nz = ad[row]- aii;
867:
868: switch (nsz){ /* Each loop in 'case' is unrolled */
869: case 1 :
870: sum1 = b[*r++];
871: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
872: for(j=0; j<nz-1; j+=2){
873: i0 = vi[0];
874: i1 = vi[1];
875: vi +=2;
876: tmp0 = tmps[i0];
877: tmp1 = tmps[i1];
878: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879: }
880: if(j == nz-1){
881: tmp0 = tmps[*vi++];
882: sum1 -= *v1++ *tmp0;
883: }
884: tmp[row ++]=sum1;
885: break;
886: case 2:
887: sum1 = b[*r++];
888: sum2 = b[*r++];
889: v2 = aa + ai[row+1];
891: for(j=0; j<nz-1; j+=2){
892: i0 = vi[0];
893: i1 = vi[1];
894: vi +=2;
895: tmp0 = tmps[i0];
896: tmp1 = tmps[i1];
897: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
898: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
899: }
900: if(j == nz-1){
901: tmp0 = tmps[*vi++];
902: sum1 -= *v1++ *tmp0;
903: sum2 -= *v2++ *tmp0;
904: }
905: sum2 -= *v2++ * sum1;
906: tmp[row ++]=sum1;
907: tmp[row ++]=sum2;
908: break;
909: case 3:
910: sum1 = b[*r++];
911: sum2 = b[*r++];
912: sum3 = b[*r++];
913: v2 = aa + ai[row+1];
914: v3 = aa + ai[row+2];
915:
916: for (j=0; j<nz-1; j+=2){
917: i0 = vi[0];
918: i1 = vi[1];
919: vi +=2;
920: tmp0 = tmps[i0];
921: tmp1 = tmps[i1];
922: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
923: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
924: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
925: }
926: if (j == nz-1){
927: tmp0 = tmps[*vi++];
928: sum1 -= *v1++ *tmp0;
929: sum2 -= *v2++ *tmp0;
930: sum3 -= *v3++ *tmp0;
931: }
932: sum2 -= *v2++ * sum1;
933: sum3 -= *v3++ * sum1;
934: sum3 -= *v3++ * sum2;
935: tmp[row ++]=sum1;
936: tmp[row ++]=sum2;
937: tmp[row ++]=sum3;
938: break;
939:
940: case 4:
941: sum1 = b[*r++];
942: sum2 = b[*r++];
943: sum3 = b[*r++];
944: sum4 = b[*r++];
945: v2 = aa + ai[row+1];
946: v3 = aa + ai[row+2];
947: v4 = aa + ai[row+3];
948:
949: for (j=0; j<nz-1; j+=2){
950: i0 = vi[0];
951: i1 = vi[1];
952: vi +=2;
953: tmp0 = tmps[i0];
954: tmp1 = tmps[i1];
955: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
956: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
957: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
958: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
959: }
960: if (j == nz-1){
961: tmp0 = tmps[*vi++];
962: sum1 -= *v1++ *tmp0;
963: sum2 -= *v2++ *tmp0;
964: sum3 -= *v3++ *tmp0;
965: sum4 -= *v4++ *tmp0;
966: }
967: sum2 -= *v2++ * sum1;
968: sum3 -= *v3++ * sum1;
969: sum4 -= *v4++ * sum1;
970: sum3 -= *v3++ * sum2;
971: sum4 -= *v4++ * sum2;
972: sum4 -= *v4++ * sum3;
973:
974: tmp[row ++]=sum1;
975: tmp[row ++]=sum2;
976: tmp[row ++]=sum3;
977: tmp[row ++]=sum4;
978: break;
979: case 5:
980: sum1 = b[*r++];
981: sum2 = b[*r++];
982: sum3 = b[*r++];
983: sum4 = b[*r++];
984: sum5 = b[*r++];
985: v2 = aa + ai[row+1];
986: v3 = aa + ai[row+2];
987: v4 = aa + ai[row+3];
988: v5 = aa + ai[row+4];
989:
990: for (j=0; j<nz-1; j+=2){
991: i0 = vi[0];
992: i1 = vi[1];
993: vi +=2;
994: tmp0 = tmps[i0];
995: tmp1 = tmps[i1];
996: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
997: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
998: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
999: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1000: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1001: }
1002: if (j == nz-1){
1003: tmp0 = tmps[*vi++];
1004: sum1 -= *v1++ *tmp0;
1005: sum2 -= *v2++ *tmp0;
1006: sum3 -= *v3++ *tmp0;
1007: sum4 -= *v4++ *tmp0;
1008: sum5 -= *v5++ *tmp0;
1009: }
1011: sum2 -= *v2++ * sum1;
1012: sum3 -= *v3++ * sum1;
1013: sum4 -= *v4++ * sum1;
1014: sum5 -= *v5++ * sum1;
1015: sum3 -= *v3++ * sum2;
1016: sum4 -= *v4++ * sum2;
1017: sum5 -= *v5++ * sum2;
1018: sum4 -= *v4++ * sum3;
1019: sum5 -= *v5++ * sum3;
1020: sum5 -= *v5++ * sum4;
1021:
1022: tmp[row ++]=sum1;
1023: tmp[row ++]=sum2;
1024: tmp[row ++]=sum3;
1025: tmp[row ++]=sum4;
1026: tmp[row ++]=sum5;
1027: break;
1028: default:
1029: SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1030: }
1031: }
1032: /* backward solve the upper triangular */
1033: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1034: nsz = ns[i];
1035: aii = ai[row+1] -1;
1036: v1 = aa + aii;
1037: vi = aj + aii;
1038: nz = aii- ad[row];
1039: switch (nsz){ /* Each loop in 'case' is unrolled */
1040: case 1 :
1041: sum1 = tmp[row];
1043: for(j=nz ; j>1; j-=2){
1044: i0 = vi[0];
1045: i1 = vi[-1];
1046: vi -=2;
1047: tmp0 = tmps[i0];
1048: tmp1 = tmps[i1];
1049: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1050: }
1051: if (j==1){
1052: tmp0 = tmps[*vi--];
1053: sum1 -= *v1-- * tmp0;
1054: }
1055: x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1056: break;
1057: case 2 :
1058: sum1 = tmp[row];
1059: sum2 = tmp[row -1];
1060: v2 = aa + ai[row]-1;
1061: for (j=nz ; j>1; j-=2){
1062: i0 = vi[0];
1063: i1 = vi[-1];
1064: vi -=2;
1065: tmp0 = tmps[i0];
1066: tmp1 = tmps[i1];
1067: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1068: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1069: }
1070: if (j==1){
1071: tmp0 = tmps[*vi--];
1072: sum1 -= *v1-- * tmp0;
1073: sum2 -= *v2-- * tmp0;
1074: }
1075:
1076: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1077: sum2 -= *v2-- * tmp0;
1078: x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1079: break;
1080: case 3 :
1081: sum1 = tmp[row];
1082: sum2 = tmp[row -1];
1083: sum3 = tmp[row -2];
1084: v2 = aa + ai[row]-1;
1085: v3 = aa + ai[row -1]-1;
1086: for (j=nz ; j>1; j-=2){
1087: i0 = vi[0];
1088: i1 = vi[-1];
1089: vi -=2;
1090: tmp0 = tmps[i0];
1091: tmp1 = tmps[i1];
1092: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1093: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1094: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1095: }
1096: if (j==1){
1097: tmp0 = tmps[*vi--];
1098: sum1 -= *v1-- * tmp0;
1099: sum2 -= *v2-- * tmp0;
1100: sum3 -= *v3-- * tmp0;
1101: }
1102: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1103: sum2 -= *v2-- * tmp0;
1104: sum3 -= *v3-- * tmp0;
1105: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1106: sum3 -= *v3-- * tmp0;
1107: x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1108:
1109: break;
1110: case 4 :
1111: sum1 = tmp[row];
1112: sum2 = tmp[row -1];
1113: sum3 = tmp[row -2];
1114: sum4 = tmp[row -3];
1115: v2 = aa + ai[row]-1;
1116: v3 = aa + ai[row -1]-1;
1117: v4 = aa + ai[row -2]-1;
1119: for (j=nz ; j>1; j-=2){
1120: i0 = vi[0];
1121: i1 = vi[-1];
1122: vi -=2;
1123: tmp0 = tmps[i0];
1124: tmp1 = tmps[i1];
1125: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1126: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1127: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1128: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1129: }
1130: if (j==1){
1131: tmp0 = tmps[*vi--];
1132: sum1 -= *v1-- * tmp0;
1133: sum2 -= *v2-- * tmp0;
1134: sum3 -= *v3-- * tmp0;
1135: sum4 -= *v4-- * tmp0;
1136: }
1138: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1139: sum2 -= *v2-- * tmp0;
1140: sum3 -= *v3-- * tmp0;
1141: sum4 -= *v4-- * tmp0;
1142: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1143: sum3 -= *v3-- * tmp0;
1144: sum4 -= *v4-- * tmp0;
1145: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1146: sum4 -= *v4-- * tmp0;
1147: x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1148: break;
1149: case 5 :
1150: sum1 = tmp[row];
1151: sum2 = tmp[row -1];
1152: sum3 = tmp[row -2];
1153: sum4 = tmp[row -3];
1154: sum5 = tmp[row -4];
1155: v2 = aa + ai[row]-1;
1156: v3 = aa + ai[row -1]-1;
1157: v4 = aa + ai[row -2]-1;
1158: v5 = aa + ai[row -3]-1;
1159: for (j=nz ; j>1; j-=2){
1160: i0 = vi[0];
1161: i1 = vi[-1];
1162: vi -=2;
1163: tmp0 = tmps[i0];
1164: tmp1 = tmps[i1];
1165: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1166: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1167: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1168: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1169: sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1170: }
1171: if (j==1){
1172: tmp0 = tmps[*vi--];
1173: sum1 -= *v1-- * tmp0;
1174: sum2 -= *v2-- * tmp0;
1175: sum3 -= *v3-- * tmp0;
1176: sum4 -= *v4-- * tmp0;
1177: sum5 -= *v5-- * tmp0;
1178: }
1180: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1181: sum2 -= *v2-- * tmp0;
1182: sum3 -= *v3-- * tmp0;
1183: sum4 -= *v4-- * tmp0;
1184: sum5 -= *v5-- * tmp0;
1185: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1186: sum3 -= *v3-- * tmp0;
1187: sum4 -= *v4-- * tmp0;
1188: sum5 -= *v5-- * tmp0;
1189: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1190: sum4 -= *v4-- * tmp0;
1191: sum5 -= *v5-- * tmp0;
1192: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1193: sum5 -= *v5-- * tmp0;
1194: x[*c--] = tmp[row] = sum5*a_a[ad[row]+shift]; row--;
1195: break;
1196: default:
1197: SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1198: }
1199: }
1200: ISRestoreIndices(isrow,&rout);
1201: ISRestoreIndices(iscol,&cout);
1202: VecRestoreArray(bb,&b);
1203: VecRestoreArray(xx,&x);
1204: PetscLogFlops(2*a->nz - A->n);
1205: return(0);
1206: }
1209: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1210: {
1211: Mat C = *B;
1212: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1213: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1214: int shift = a->indexshift,*r,*ic,*c,ierr,n = A->m,*bi = b->i;
1215: int *bj = b->j+shift,*nbj=b->j +(!shift),*ajtmp,*bjtmp,nz,row,prow;
1216: int *ics,i,j,idx,*ai = a->i,*aj = a->j+shift,*bd = b->diag,node_max,nsz;
1217: int *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1218: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1219: PetscScalar tmp,*ba = b->a+shift,*aa = a->a+shift,*pv,*rtmps1,*rtmps2,*rtmps3;
1220: PetscTruth damp;
1221: PetscReal damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1224: ierr = ISGetIndices(isrow,&r);
1225: ierr = ISGetIndices(iscol,&c);
1226: ierr = ISGetIndices(isicol,&ic);
1227: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1228: ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1229: ics = ic + shift; rtmps1 = rtmp1 + shift;
1230: rtmp2 = rtmp1 + n; rtmps2 = rtmp2 + shift;
1231: rtmp3 = rtmp2 + n; rtmps3 = rtmp3 + shift;
1232:
1233: node_max = a->inode.node_count;
1234: ns = a->inode.size ;
1235: if (!ns){
1236: SETERRQ(1,"Matrix without inode information");
1237: }
1239: /* If max inode size > 3, split it into two inodes.*/
1240: /* also map the inode sizes according to the ordering */
1241: PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1242: for (i=0,j=0; i<node_max; ++i,++j){
1243: if (ns[i]>3) {
1244: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1245: ++j;
1246: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247: } else {
1248: tmp_vec1[j] = ns[i];
1249: }
1250: }
1251: /* Use the correct node_max */
1252: node_max = j;
1254: /* Now reorder the inode info based on mat re-ordering info */
1255: /* First create a row -> inode_size_array_index map */
1256: PetscMalloc(n*sizeof(int)+1,&nsmap);
1257: PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1258: for (i=0,row=0; i<node_max; i++) {
1259: nsz = tmp_vec1[i];
1260: for (j=0; j<nsz; j++,row++) {
1261: nsmap[row] = i;
1262: }
1263: }
1264: /* Using nsmap, create a reordered ns structure */
1265: for (i=0,j=0; i< node_max; i++) {
1266: nsz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1267: tmp_vec2[i] = nsz;
1268: j += nsz;
1269: }
1270: PetscFree(nsmap);
1271: PetscFree(tmp_vec1);
1272: /* Now use the correct ns */
1273: ns = tmp_vec2;
1276: do {
1277: damp = PETSC_FALSE;
1278: /* Now loop over each block-row, and do the factorization */
1279: for (i=0,row=0; i<node_max; i++) {
1280: nsz = ns[i];
1281: nz = bi[row+1] - bi[row];
1282: bjtmp = bj + bi[row];
1283:
1284: switch (nsz){
1285: case 1:
1286: for (j=0; j<nz; j++){
1287: idx = bjtmp[j];
1288: rtmps1[idx] = 0.0;
1289: }
1290:
1291: /* load in initial (unfactored row) */
1292: idx = r[row];
1293: nz = ai[idx+1] - ai[idx];
1294: ajtmp = aj + ai[idx];
1295: v1 = aa + ai[idx];
1297: for (j=0; j<nz; j++) {
1298: idx = ics[ajtmp[j]];
1299: rtmp1[idx] = v1[j];
1300: if (ajtmp[j] == r[row]) {
1301: rtmp1[idx] += damping;
1302: }
1303: }
1304: prow = *bjtmp++ + shift;
1305: while (prow < row) {
1306: pc1 = rtmp1 + prow;
1307: if (*pc1 != 0.0){
1308: pv = ba + bd[prow];
1309: pj = nbj + bd[prow];
1310: mul1 = *pc1 * *pv++;
1311: *pc1 = mul1;
1312: nz = bi[prow+1] - bd[prow] - 1;
1313: PetscLogFlops(2*nz);
1314: for (j=0; j<nz; j++) {
1315: tmp = pv[j];
1316: idx = pj[j];
1317: rtmps1[idx] -= mul1 * tmp;
1318: }
1319: }
1320: prow = *bjtmp++ + shift;
1321: }
1322: nz = bi[row+1] - bi[row];
1323: pj = bj + bi[row];
1324: pc1 = ba + bi[row];
1325: if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1326: if (b->lu_damping) {
1327: damp = PETSC_TRUE;
1328: if (damping) damping *= 2.0;
1329: else damping = 1.e-12;
1330: ndamp++;
1331: goto endofwhile;
1332: } else {
1333: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1334: }
1335: }
1336: rtmp1[row] = 1.0/rtmp1[row];
1337: for (j=0; j<nz; j++) {
1338: idx = pj[j];
1339: pc1[j] = rtmps1[idx];
1340: }
1341: break;
1342:
1343: case 2:
1344: for (j=0; j<nz; j++) {
1345: idx = bjtmp[j];
1346: rtmps1[idx] = 0.0;
1347: rtmps2[idx] = 0.0;
1348: }
1349:
1350: /* load in initial (unfactored row) */
1351: idx = r[row];
1352: nz = ai[idx+1] - ai[idx];
1353: ajtmp = aj + ai[idx];
1354: v1 = aa + ai[idx];
1355: v2 = aa + ai[idx+1];
1356:
1357: for (j=0; j<nz; j++) {
1358: idx = ics[ajtmp[j]];
1359: rtmp1[idx] = v1[j];
1360: rtmp2[idx] = v2[j];
1361: if (ajtmp[j] == r[row]) {
1362: rtmp1[idx] += damping;
1363: }
1364: if (ajtmp[j] == r[row+1]) {
1365: rtmp2[idx] += damping;
1366: }
1367: }
1368: prow = *bjtmp++ + shift;
1369: while (prow < row) {
1370: pc1 = rtmp1 + prow;
1371: pc2 = rtmp2 + prow;
1372: if (*pc1 != 0.0 || *pc2 != 0.0){
1373: pv = ba + bd[prow];
1374: pj = nbj + bd[prow];
1375: mul1 = *pc1 * *pv;
1376: mul2 = *pc2 * *pv;
1377: ++pv;
1378: *pc1 = mul1;
1379: *pc2 = mul2;
1380:
1381: nz = bi[prow+1] - bd[prow] - 1;
1382: PetscLogFlops(2*2*nz);
1383: for (j=0; j<nz; j++) {
1384: tmp = pv[j];
1385: idx = pj[j];
1386: rtmps1[idx] -= mul1 * tmp;
1387: rtmps2[idx] -= mul2 * tmp;
1388: }
1389: }
1390: prow = *bjtmp++ + shift;
1391: }
1392: /* Now take care of the odd element*/
1393: pc1 = rtmp1 + prow;
1394: pc2 = rtmp2 + prow;
1395: if (*pc2 != 0.0){
1396: pj = nbj + bd[prow];
1397: if (PetscAbsScalar(*pc1) < zeropivot) {
1398: if (b->lu_damping) {
1399: damp = PETSC_TRUE;
1400: if (damping) damping *= 2.0;
1401: else damping = 1.e-12;
1402: ndamp++;
1403: goto endofwhile;
1404: } else {
1405: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1406: }
1407: }
1408: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1409: *pc2 = mul2;
1410: nz = bi[prow+1] - bd[prow] - 1;
1411: PetscLogFlops(2*nz);
1412: for (j=0; j<nz; j++) {
1413: idx = pj[j] + shift;
1414: tmp = rtmp1[idx];
1415: rtmp2[idx] -= mul2 * tmp;
1416: }
1417: }
1418:
1419: nz = bi[row+1] - bi[row];
1420: pj = bj + bi[row];
1421: pc1 = ba + bi[row];
1422: pc2 = ba + bi[row+1];
1423: if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1424: if (b->lu_damping) {
1425: damp = PETSC_TRUE;
1426: if (damping) damping *= 2.0;
1427: else damping = 1.e-12;
1428: ndamp++;
1429: goto endofwhile;
1430: } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1431: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1432: } else {
1433: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1434: }
1435: }
1436: rtmp1[row] = 1.0/rtmp1[row];
1437: rtmp2[row+1] = 1.0/rtmp2[row+1];
1438: for (j=0; j<nz; j++) {
1439: idx = pj[j];
1440: pc1[j] = rtmps1[idx];
1441: pc2[j] = rtmps2[idx];
1442: }
1443: break;
1445: case 3:
1446: for (j=0; j<nz; j++) {
1447: idx = bjtmp[j];
1448: rtmps1[idx] = 0.0;
1449: rtmps2[idx] = 0.0;
1450: rtmps3[idx] = 0.0;
1451: }
1452: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1453: idx = r[row];
1454: nz = ai[idx+1] - ai[idx];
1455: ajtmp = aj + ai[idx];
1456: v1 = aa + ai[idx];
1457: v2 = aa + ai[idx+1];
1458: v3 = aa + ai[idx+2];
1459: for (j=0; j<nz; j++) {
1460: idx = ics[ajtmp[j]];
1461: rtmp1[idx] = v1[j];
1462: rtmp2[idx] = v2[j];
1463: rtmp3[idx] = v3[j];
1464: if (ajtmp[j] == r[row]) {
1465: rtmp1[idx] += damping;
1466: }
1467: if (ajtmp[j] == r[row+1]) {
1468: rtmp2[idx] += damping;
1469: }
1470: if (ajtmp[j] == r[row+2]) {
1471: rtmp3[idx] += damping;
1472: }
1473: }
1474: /* loop over all pivot row blocks above this row block */
1475: prow = *bjtmp++ + shift;
1476: while (prow < row) {
1477: pc1 = rtmp1 + prow;
1478: pc2 = rtmp2 + prow;
1479: pc3 = rtmp3 + prow;
1480: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1481: pv = ba + bd[prow];
1482: pj = nbj + bd[prow];
1483: mul1 = *pc1 * *pv;
1484: mul2 = *pc2 * *pv;
1485: mul3 = *pc3 * *pv;
1486: ++pv;
1487: *pc1 = mul1;
1488: *pc2 = mul2;
1489: *pc3 = mul3;
1490:
1491: nz = bi[prow+1] - bd[prow] - 1;
1492: PetscLogFlops(3*2*nz);
1493: /* update this row based on pivot row */
1494: for (j=0; j<nz; j++) {
1495: tmp = pv[j];
1496: idx = pj[j];
1497: rtmps1[idx] -= mul1 * tmp;
1498: rtmps2[idx] -= mul2 * tmp;
1499: rtmps3[idx] -= mul3 * tmp;
1500: }
1501: }
1502: prow = *bjtmp++ + shift;
1503: }
1504: /* Now take care of diagonal block in this set of rows */
1505: pc1 = rtmp1 + prow;
1506: pc2 = rtmp2 + prow;
1507: pc3 = rtmp3 + prow;
1508: if (*pc2 != 0.0 || *pc3 != 0.0){
1509: pj = nbj + bd[prow];
1510: if (PetscAbsScalar(*pc1) < zeropivot) {
1511: if (b->lu_damping) {
1512: damp = PETSC_TRUE;
1513: if (damping) damping *= 2.0;
1514: else damping = 1.e-12;
1515: ndamp++;
1516: goto endofwhile;
1517: } else {
1518: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1519: }
1520: }
1521: mul2 = (*pc2)/(*pc1);
1522: mul3 = (*pc3)/(*pc1);
1523: *pc2 = mul2;
1524: *pc3 = mul3;
1525: nz = bi[prow+1] - bd[prow] - 1;
1526: PetscLogFlops(2*2*nz);
1527: for (j=0; j<nz; j++) {
1528: idx = pj[j] + shift;
1529: tmp = rtmp1[idx];
1530: rtmp2[idx] -= mul2 * tmp;
1531: rtmp3[idx] -= mul3 * tmp;
1532: }
1533: }
1534: ++prow;
1535: pc2 = rtmp2 + prow;
1536: pc3 = rtmp3 + prow;
1537: if (*pc3 != 0.0){
1538: pj = nbj + bd[prow];
1539: pj = nbj + bd[prow];
1540: if (PetscAbsScalar(*pc2) < zeropivot) {
1541: if (b->lu_damping) {
1542: damp = PETSC_TRUE;
1543: if (damping) damping *= 2.0;
1544: else damping = 1.e-12;
1545: ndamp++;
1546: goto endofwhile;
1547: } else {
1548: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1549: }
1550: }
1551: mul3 = (*pc3)/(*pc2);
1552: *pc3 = mul3;
1553: nz = bi[prow+1] - bd[prow] - 1;
1554: PetscLogFlops(2*2*nz);
1555: for (j=0; j<nz; j++) {
1556: idx = pj[j] + shift;
1557: tmp = rtmp2[idx];
1558: rtmp3[idx] -= mul3 * tmp;
1559: }
1560: }
1561: nz = bi[row+1] - bi[row];
1562: pj = bj + bi[row];
1563: pc1 = ba + bi[row];
1564: pc2 = ba + bi[row+1];
1565: pc3 = ba + bi[row+2];
1566: if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1567: if (b->lu_damping) {
1568: damp = PETSC_TRUE;
1569: if (damping) damping *= 2.0;
1570: else damping = 1.e-12;
1571: ndamp++;
1572: goto endofwhile;
1573: } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1574: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1575: } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1576: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1577: } else {
1578: SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1579: }
1580: }
1581: rtmp1[row] = 1.0/rtmp1[row];
1582: rtmp2[row+1] = 1.0/rtmp2[row+1];
1583: rtmp3[row+2] = 1.0/rtmp3[row+2];
1584: /* copy row entries from dense representation to sparse */
1585: for (j=0; j<nz; j++) {
1586: idx = pj[j];
1587: pc1[j] = rtmps1[idx];
1588: pc2[j] = rtmps2[idx];
1589: pc3[j] = rtmps3[idx];
1590: }
1591: break;
1593: default:
1594: SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1595: }
1596: row += nsz; /* Update the row */
1597: }
1598: endofwhile:;
1599: } while (damp);
1600: PetscFree(rtmp1);
1601: PetscFree(tmp_vec2);
1602: ISRestoreIndices(isicol,&ic);
1603: ISRestoreIndices(isrow,&r);
1604: ISRestoreIndices(iscol,&c);
1605: C->factor = FACTOR_LU;
1606: C->assembled = PETSC_TRUE;
1607: if (ndamp || b->lu_damping) {
1608: PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %gn",ndamp,damping);
1609: }
1610: PetscLogFlops(C->n);
1611: return(0);
1612: }
1614: /*
1615: This is really ugly. if inodes are used this replaces the
1616: permutations with ones that correspond to rows/cols of the matrix
1617: rather then inode blocks
1618: */
1619: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1620: {
1621: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1622: int ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1623: int row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1624: int nslim_col,*ns_col;
1625: IS ris = *rperm,cis = *cperm;
1626: PetscTruth flg;
1629: PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1630: if (!flg) return(0);
1632: if (!a->inode.size) return(0); /* no inodes so return */
1633: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1635: ierr = Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1636: ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1637: ierr = PetscMalloc((m+n+1)*sizeof(int),&permr);
1638: permc = permr + m;
1640: ierr = ISGetIndices(ris,&ridx);
1641: ierr = ISGetIndices(cis,&cidx);
1643: /* Form the inode structure for the rows of permuted matric using inv perm*/
1644: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1646: /* Construct the permutations for rows*/
1647: for (i=0,row = 0; i<nslim_row; ++i){
1648: indx = ridx[i];
1649: start_val = tns[indx];
1650: end_val = tns[indx + 1];
1651: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1652: }
1654: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1655: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1657: /*Construct permutations for columns*/
1658: for (i=0,col=0; i<nslim_col; ++i){
1659: indx = cidx[i];
1660: start_val = tns[indx];
1661: end_val = tns[indx + 1];
1662: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1663: }
1665: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1666: ISSetPermutation(*rperm);
1667: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1668: ISSetPermutation(*cperm);
1669:
1670: ierr = ISRestoreIndices(ris,&ridx);
1671: ierr = ISRestoreIndices(cis,&cidx);
1673: PetscFree(ns_col);
1674: PetscFree(permr);
1675: ISDestroy(cis);
1676: ISDestroy(ris);
1677: PetscFree(tns);
1678: return(0);
1679: }
1683: /*@C
1684: MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.
1686: Collective on Mat
1688: Input Parameter:
1689: . A - the SeqAIJ matrix.
1691: Output Parameter:
1692: + node_count - no of inodes present in the matrix.
1693: . sizes - an array of size node_count,with sizes of each inode.
1694: - limit - the max size used to generate the inodes.
1696: Level: advanced
1698: Notes: This routine returns some internal storage information
1699: of the matrix, it is intended to be used by advanced users.
1700: It should be called after the matrix is assembled.
1701: The contents of the sizes[] array should not be changed.
1703: .keywords: matrix, seqaij, get, inode
1705: .seealso: MatGetInfo()
1706: @*/
1707: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1708: {
1709: Mat_SeqAIJ *a;
1710: PetscTruth flg;
1711: int ierr;
1715: PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1716: if (!flg) SETERRQ(PETSC_ERR_ARG_WRONG,"MatSeqAIJ only");
1717: a = (Mat_SeqAIJ*)A->data;
1718: *node_count = a->inode.node_count;
1719: *sizes = a->inode.size;
1720: *limit = a->inode.limit;
1721: return(0);
1722: }