Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include src/mat/impls/csr/inode/inode.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_inode *a = (Mat_inode*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->n;
19: m = A->m;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds the symmetric version of the nonzero structure.
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_inode *a = (Mat_inode*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->m;
72: n = A->n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds the nonsymmetric version of the nonzero structure.
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_inode *a = (Mat_inode*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_inode *a = (Mat_inode*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
243: if (symmetric) {
244: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
245: } else {
246: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
247: }
248: return(0);
249: }
253: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
254: {
258: if (!ia) return(0);
259: PetscFree(*ia);
260: PetscFree(*ja);
261: return(0);
262: }
264: /* ----------------------------------------------------------- */
268: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269: {
270: Mat_inode *a = (Mat_inode*)A->data;
272: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276: nslim_row = a->inode.node_count;
277: n = A->n;
279: /* Create The column_inode for this matrix */
280: Mat_CreateColInode(A,&nslim_col,&ns_col);
281:
282: /* allocate space for reformated column_inode structure */
283: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
284: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
285: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
287: for (i1=0,col=0; i1<nslim_col; ++i1){
288: nsz = ns_col[i1];
289: for (i2=0; i2<nsz; ++i2,++col)
290: tvc[col] = i1;
291: }
292: /* allocate space for column pointers */
293: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
294: *iia = ia;
295: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
296: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
298: /* determine the number of columns in each row */
299: ia[0] = oshift;
300: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
301: j = aj + ai[row] + ishift;
302: col = *j++ + ishift;
303: i2 = tvc[col];
304: nz = ai[row+1] - ai[row];
305: while (nz-- > 0) { /* off-diagonal elemets */
306: /* ia[i1+1]++; */
307: ia[i2+1]++;
308: i2++;
309: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
310: i2 = tvc[col];
311: }
312: }
314: /* shift ia[i] to point to next col */
315: for (i1=1; i1<nslim_col+1; i1++) {
316: col = ia[i1-1];
317: ia[i1] += col;
318: work[i1-1] = col - oshift;
319: }
321: /* allocate space for column pointers */
322: nz = ia[nslim_col] + (!ishift);
323: PetscMalloc(nz*sizeof(PetscInt),&ja);
324: *jja = ja;
326: /* loop over matrix putting into ja */
327: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
328: j = aj + ai[row] + ishift;
329: i2 = 0; /* Col inode index */
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
338: i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
351: {
355: Mat_CreateColInode(A,n,PETSC_NULL);
356: if (!ia) return(0);
358: if (symmetric) {
359: /* Since the indices are symmetric it does'nt matter */
360: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
361: } else {
362: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
363: }
364: return(0);
365: }
369: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
370: {
374: if (!ia) return(0);
375: PetscFree(*ia);
376: PetscFree(*ja);
377: return(0);
378: }
380: /* ----------------------------------------------------------- */
/*
   MatMult_Inode - Computes yy = A*xx, processing the rows one inode at a time.

   All rows of an inode are multiplied against the same column index list:
   idx walks the indices of the node's first row only, while v1..v5 walk the
   values of each row of the node (each row holding sz values).  Node sizes
   1 to 5 are supported; any other size raises PETSC_ERR_COR, as does a
   missing inode size array.
*/
384: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
385: {
386: Mat_inode *a = (Mat_inode*)A->data;
387: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
388: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
390: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
391:
392: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
393: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
394: #endif
397: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
398: node_max = a->inode.node_count;
399: ns = a->inode.size; /* Node Size array */
400: VecGetArray(xx,&x);
401: VecGetArray(yy,&y);
/* idx, v1 and ii are running cursors into the column indices, values and row offsets */
402: idx = a->j;
403: v1 = a->a;
404: ii = a->i;
406: for (i = 0,row = 0; i< node_max; ++i){
407: nsz = ns[i];
/* nonzero count of the node's first row; it is used as the length of every row in the node */
408: n = ii[1] - ii[0];
409: ii += nsz;
410: sz = n; /* No of non zeros in this row */
411: /* Switch on the size of Node */
412: switch (nsz){ /* Each loop in 'case' is unrolled */
413: case 1 :
414: sum1 = 0;
415:
/* two columns per iteration */
416: for(n = 0; n< sz-1; n+=2) {
417: i1 = idx[0]; /* The instructions are ordered to */
418: i2 = idx[1]; /* make the compiler's job easy */
419: idx += 2;
420: tmp0 = x[i1];
421: tmp1 = x[i2];
422: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423: }
424:
425: if (n == sz-1){ /* Take care of the last nonzero */
426: tmp0 = x[*idx++];
427: sum1 += *v1++ * tmp0;
428: }
429: y[row++]=sum1;
430: break;
431: case 2:
432: sum1 = 0;
433: sum2 = 0;
/* n still equals sz here, so v2 points at the values of the node's second row */
434: v2 = v1 + n;
435:
436: for (n = 0; n< sz-1; n+=2) {
437: i1 = idx[0];
438: i2 = idx[1];
439: idx += 2;
440: tmp0 = x[i1];
441: tmp1 = x[i2];
442: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
443: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
444: }
445: if (n == sz-1){
446: tmp0 = x[*idx++];
447: sum1 += *v1++ * tmp0;
448: sum2 += *v2++ * tmp0;
449: }
450: y[row++]=sum1;
451: y[row++]=sum2;
452: v1 =v2; /* Since the next block to be processed starts there*/
/* idx only walked one row's indices; skip the index list of the second row */
453: idx +=sz;
454: break;
455: case 3:
456: sum1 = 0;
457: sum2 = 0;
458: sum3 = 0;
459: v2 = v1 + n;
460: v3 = v2 + n;
461:
462: for (n = 0; n< sz-1; n+=2) {
463: i1 = idx[0];
464: i2 = idx[1];
465: idx += 2;
466: tmp0 = x[i1];
467: tmp1 = x[i2];
468: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
469: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
470: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
471: }
472: if (n == sz-1){
473: tmp0 = x[*idx++];
474: sum1 += *v1++ * tmp0;
475: sum2 += *v2++ * tmp0;
476: sum3 += *v3++ * tmp0;
477: }
478: y[row++]=sum1;
479: y[row++]=sum2;
480: y[row++]=sum3;
481: v1 =v3; /* Since the next block to be processed starts there*/
482: idx +=2*sz;
483: break;
484: case 4:
485: sum1 = 0;
486: sum2 = 0;
487: sum3 = 0;
488: sum4 = 0;
489: v2 = v1 + n;
490: v3 = v2 + n;
491: v4 = v3 + n;
492:
493: for (n = 0; n< sz-1; n+=2) {
494: i1 = idx[0];
495: i2 = idx[1];
496: idx += 2;
497: tmp0 = x[i1];
498: tmp1 = x[i2];
499: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
500: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
501: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
502: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
503: }
504: if (n == sz-1){
505: tmp0 = x[*idx++];
506: sum1 += *v1++ * tmp0;
507: sum2 += *v2++ * tmp0;
508: sum3 += *v3++ * tmp0;
509: sum4 += *v4++ * tmp0;
510: }
511: y[row++]=sum1;
512: y[row++]=sum2;
513: y[row++]=sum3;
514: y[row++]=sum4;
515: v1 =v4; /* Since the next block to be processed starts there*/
516: idx +=3*sz;
517: break;
518: case 5:
519: sum1 = 0;
520: sum2 = 0;
521: sum3 = 0;
522: sum4 = 0;
523: sum5 = 0;
524: v2 = v1 + n;
525: v3 = v2 + n;
526: v4 = v3 + n;
527: v5 = v4 + n;
528:
529: for (n = 0; n<sz-1; n+=2) {
530: i1 = idx[0];
531: i2 = idx[1];
532: idx += 2;
533: tmp0 = x[i1];
534: tmp1 = x[i2];
535: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
536: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
537: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
538: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
539: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
540: }
541: if (n == sz-1){
542: tmp0 = x[*idx++];
543: sum1 += *v1++ * tmp0;
544: sum2 += *v2++ * tmp0;
545: sum3 += *v3++ * tmp0;
546: sum4 += *v4++ * tmp0;
547: sum5 += *v5++ * tmp0;
548: }
549: y[row++]=sum1;
550: y[row++]=sum2;
551: y[row++]=sum3;
552: y[row++]=sum4;
553: y[row++]=sum5;
554: v1 =v5; /* Since the next block to be processed starts there */
555: idx +=4*sz;
556: break;
557: default :
/* NOTE(review): this error path returns without calling VecRestoreArray on xx/yy */
558: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
559: }
560: }
561: VecRestoreArray(xx,&x);
562: VecRestoreArray(yy,&y);
563: PetscLogFlops(2*a->nz - A->m);
564: return(0);
565: }
566: /* ----------------------------------------------------------- */
567: /* Almost same code as the MatMult_Inode() */
/*
   MatMultAdd_Inode - Computes yy = zz + A*xx, processing the rows one inode
   at a time.

   Same traversal as MatMult_Inode(), except that each row sum starts from the
   corresponding entry of zz (read through zt) instead of zero.  zz and yy may
   be the same vector; in that case the array of yy is reused for z.  Node
   sizes 1 to 5 are supported; any other size raises PETSC_ERR_COR, as does a
   missing inode size array.
*/
570: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
571: {
572: Mat_inode *a = (Mat_inode*)A->data;
573: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
574: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
576: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
577:
579: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
580: node_max = a->inode.node_count;
581: ns = a->inode.size; /* Node Size array */
582: VecGetArray(xx,&x);
583: VecGetArray(yy,&y);
/* only fetch a separate array for zz when it is a different vector from yy */
584: if (zz != yy) {
585: VecGetArray(zz,&z);
586: } else {
587: z = y;
588: }
/* zt is the read cursor over z; z itself is kept for VecRestoreArray */
589: zt = z;
591: idx = a->j;
592: v1 = a->a;
593: ii = a->i;
595: for (i = 0,row = 0; i< node_max; ++i){
596: nsz = ns[i];
/* nonzero count of the node's first row; it is used as the length of every row in the node */
597: n = ii[1] - ii[0];
598: ii += nsz;
599: sz = n; /* No of non zeros in this row */
600: /* Switch on the size of Node */
601: switch (nsz){ /* Each loop in 'case' is unrolled */
602: case 1 :
603: sum1 = *zt++;
604:
605: for(n = 0; n< sz-1; n+=2) {
606: i1 = idx[0]; /* The instructions are ordered to */
607: i2 = idx[1]; /* make the compiler's job easy */
608: idx += 2;
609: tmp0 = x[i1];
610: tmp1 = x[i2];
611: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
612: }
613:
614: if(n == sz-1){ /* Take care of the last nonzero */
615: tmp0 = x[*idx++];
616: sum1 += *v1++ * tmp0;
617: }
618: y[row++]=sum1;
619: break;
620: case 2:
621: sum1 = *zt++;
622: sum2 = *zt++;
/* n still equals sz here, so v2 points at the values of the node's second row */
623: v2 = v1 + n;
624:
625: for(n = 0; n< sz-1; n+=2) {
626: i1 = idx[0];
627: i2 = idx[1];
628: idx += 2;
629: tmp0 = x[i1];
630: tmp1 = x[i2];
631: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
632: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
633: }
634: if(n == sz-1){
635: tmp0 = x[*idx++];
636: sum1 += *v1++ * tmp0;
637: sum2 += *v2++ * tmp0;
638: }
639: y[row++]=sum1;
640: y[row++]=sum2;
641: v1 =v2; /* Since the next block to be processed starts there*/
/* idx only walked one row's indices; skip the index list of the second row */
642: idx +=sz;
643: break;
644: case 3:
645: sum1 = *zt++;
646: sum2 = *zt++;
647: sum3 = *zt++;
648: v2 = v1 + n;
649: v3 = v2 + n;
650:
651: for (n = 0; n< sz-1; n+=2) {
652: i1 = idx[0];
653: i2 = idx[1];
654: idx += 2;
655: tmp0 = x[i1];
656: tmp1 = x[i2];
657: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
658: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
659: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
660: }
661: if (n == sz-1){
662: tmp0 = x[*idx++];
663: sum1 += *v1++ * tmp0;
664: sum2 += *v2++ * tmp0;
665: sum3 += *v3++ * tmp0;
666: }
667: y[row++]=sum1;
668: y[row++]=sum2;
669: y[row++]=sum3;
670: v1 =v3; /* Since the next block to be processed starts there*/
671: idx +=2*sz;
672: break;
673: case 4:
674: sum1 = *zt++;
675: sum2 = *zt++;
676: sum3 = *zt++;
677: sum4 = *zt++;
678: v2 = v1 + n;
679: v3 = v2 + n;
680: v4 = v3 + n;
681:
682: for (n = 0; n< sz-1; n+=2) {
683: i1 = idx[0];
684: i2 = idx[1];
685: idx += 2;
686: tmp0 = x[i1];
687: tmp1 = x[i2];
688: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
689: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
691: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
692: }
693: if (n == sz-1){
694: tmp0 = x[*idx++];
695: sum1 += *v1++ * tmp0;
696: sum2 += *v2++ * tmp0;
697: sum3 += *v3++ * tmp0;
698: sum4 += *v4++ * tmp0;
699: }
700: y[row++]=sum1;
701: y[row++]=sum2;
702: y[row++]=sum3;
703: y[row++]=sum4;
704: v1 =v4; /* Since the next block to be processed starts there*/
705: idx +=3*sz;
706: break;
707: case 5:
708: sum1 = *zt++;
709: sum2 = *zt++;
710: sum3 = *zt++;
711: sum4 = *zt++;
712: sum5 = *zt++;
713: v2 = v1 + n;
714: v3 = v2 + n;
715: v4 = v3 + n;
716: v5 = v4 + n;
717:
718: for (n = 0; n<sz-1; n+=2) {
719: i1 = idx[0];
720: i2 = idx[1];
721: idx += 2;
722: tmp0 = x[i1];
723: tmp1 = x[i2];
724: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
725: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
726: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
727: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
728: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
729: }
730: if(n == sz-1){
731: tmp0 = x[*idx++];
732: sum1 += *v1++ * tmp0;
733: sum2 += *v2++ * tmp0;
734: sum3 += *v3++ * tmp0;
735: sum4 += *v4++ * tmp0;
736: sum5 += *v5++ * tmp0;
737: }
738: y[row++]=sum1;
739: y[row++]=sum2;
740: y[row++]=sum3;
741: y[row++]=sum4;
742: y[row++]=sum5;
743: v1 =v5; /* Since the next block to be processed starts there */
744: idx +=4*sz;
745: break;
746: default :
/* NOTE(review): this error path returns without calling VecRestoreArray on xx/yy/zz */
747: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
748: }
749: }
750: VecRestoreArray(xx,&x);
751: VecRestoreArray(yy,&y);
752: if (zz != yy) {
753: VecRestoreArray(zz,&z);
754: }
755: PetscLogFlops(2*a->nz);
756: return(0);
757: }
759: /* ----------------------------------------------------------- */
/*
   MatSolve_Inode - Solves with an LU-factored matrix, one inode at a time:
   a forward sweep over the lower triangular part followed by a backward
   sweep over the upper triangular part.

   bb is read through the row index set a->row and xx is written through the
   column index set a->col (walked from its last entry backwards).  The
   intermediate vector lives in a->solve_work.  a->diag[row] gives the offset
   of the diagonal entry of each row inside a->a / a->j.

   Raises PETSC_ERR_ARG_WRONGSTATE if A->factor is not FACTOR_LU and
   PETSC_ERR_COR if the inode size array is missing or a node size is outside
   1..5.
*/
762: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
763: {
764: Mat_inode *a = (Mat_inode*)A->data;
765: IS iscol = a->col,isrow = a->row;
767: PetscInt *r,*c,i,j,n = A->m,*ai = a->i,nz,*a_j = a->j;
768: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
769: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
770: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
773: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
774: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
775: node_max = a->inode.node_count;
776: ns = a->inode.size; /* Node Size array */
778: VecGetArray(bb,&b);
779: VecGetArray(xx,&x);
780: tmp = a->solve_work;
781:
/* r walks the row indices forwards; c starts at the last column index and walks backwards */
782: ISGetIndices(isrow,&rout); r = rout;
783: ISGetIndices(iscol,&cout); c = cout + (n-1);
784:
785: /* forward solve the lower triangular */
786: tmps = tmp ;
787: aa = a_a ;
788: aj = a_j ;
789: ad = a->diag;
791: for (i = 0,row = 0; i< node_max; ++i){
792: nsz = ns[i];
793: aii = ai[row];
794: v1 = aa + aii;
795: vi = aj + aii;
/* number of entries left of the diagonal in the node's first row */
796: nz = ad[row]- aii;
797:
798: switch (nsz){ /* Each loop in 'case' is unrolled */
799: case 1 :
800: sum1 = b[*r++];
801: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
802: for(j=0; j<nz-1; j+=2){
803: i0 = vi[0];
804: i1 = vi[1];
805: vi +=2;
806: tmp0 = tmps[i0];
807: tmp1 = tmps[i1];
808: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809: }
810: if(j == nz-1){
811: tmp0 = tmps[*vi++];
812: sum1 -= *v1++ *tmp0;
813: }
814: tmp[row ++]=sum1;
815: break;
816: case 2:
817: sum1 = b[*r++];
818: sum2 = b[*r++];
819: v2 = aa + ai[row+1];
821: for(j=0; j<nz-1; j+=2){
822: i0 = vi[0];
823: i1 = vi[1];
824: vi +=2;
825: tmp0 = tmps[i0];
826: tmp1 = tmps[i1];
827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829: }
830: if(j == nz-1){
831: tmp0 = tmps[*vi++];
832: sum1 -= *v1++ *tmp0;
833: sum2 -= *v2++ *tmp0;
834: }
/* the next stored value of row 2 multiplies the result just computed for row 1 of this node */
835: sum2 -= *v2++ * sum1;
836: tmp[row ++]=sum1;
837: tmp[row ++]=sum2;
838: break;
839: case 3:
840: sum1 = b[*r++];
841: sum2 = b[*r++];
842: sum3 = b[*r++];
843: v2 = aa + ai[row+1];
844: v3 = aa + ai[row+2];
845:
846: for (j=0; j<nz-1; j+=2){
847: i0 = vi[0];
848: i1 = vi[1];
849: vi +=2;
850: tmp0 = tmps[i0];
851: tmp1 = tmps[i1];
852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855: }
856: if (j == nz-1){
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: sum3 -= *v3++ *tmp0;
861: }
/* couple each row with the earlier rows of the same node */
862: sum2 -= *v2++ * sum1;
863: sum3 -= *v3++ * sum1;
864: sum3 -= *v3++ * sum2;
865: tmp[row ++]=sum1;
866: tmp[row ++]=sum2;
867: tmp[row ++]=sum3;
868: break;
869:
870: case 4:
871: sum1 = b[*r++];
872: sum2 = b[*r++];
873: sum3 = b[*r++];
874: sum4 = b[*r++];
875: v2 = aa + ai[row+1];
876: v3 = aa + ai[row+2];
877: v4 = aa + ai[row+3];
878:
879: for (j=0; j<nz-1; j+=2){
880: i0 = vi[0];
881: i1 = vi[1];
882: vi +=2;
883: tmp0 = tmps[i0];
884: tmp1 = tmps[i1];
885: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
886: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
887: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
888: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
889: }
890: if (j == nz-1){
891: tmp0 = tmps[*vi++];
892: sum1 -= *v1++ *tmp0;
893: sum2 -= *v2++ *tmp0;
894: sum3 -= *v3++ *tmp0;
895: sum4 -= *v4++ *tmp0;
896: }
/* couple each row with the earlier rows of the same node */
897: sum2 -= *v2++ * sum1;
898: sum3 -= *v3++ * sum1;
899: sum4 -= *v4++ * sum1;
900: sum3 -= *v3++ * sum2;
901: sum4 -= *v4++ * sum2;
902: sum4 -= *v4++ * sum3;
903:
904: tmp[row ++]=sum1;
905: tmp[row ++]=sum2;
906: tmp[row ++]=sum3;
907: tmp[row ++]=sum4;
908: break;
909: case 5:
910: sum1 = b[*r++];
911: sum2 = b[*r++];
912: sum3 = b[*r++];
913: sum4 = b[*r++];
914: sum5 = b[*r++];
915: v2 = aa + ai[row+1];
916: v3 = aa + ai[row+2];
917: v4 = aa + ai[row+3];
918: v5 = aa + ai[row+4];
919:
920: for (j=0; j<nz-1; j+=2){
921: i0 = vi[0];
922: i1 = vi[1];
923: vi +=2;
924: tmp0 = tmps[i0];
925: tmp1 = tmps[i1];
926: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
927: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
928: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
929: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
930: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
931: }
932: if (j == nz-1){
933: tmp0 = tmps[*vi++];
934: sum1 -= *v1++ *tmp0;
935: sum2 -= *v2++ *tmp0;
936: sum3 -= *v3++ *tmp0;
937: sum4 -= *v4++ *tmp0;
938: sum5 -= *v5++ *tmp0;
939: }
/* couple each row with the earlier rows of the same node */
941: sum2 -= *v2++ * sum1;
942: sum3 -= *v3++ * sum1;
943: sum4 -= *v4++ * sum1;
944: sum5 -= *v5++ * sum1;
945: sum3 -= *v3++ * sum2;
946: sum4 -= *v4++ * sum2;
947: sum5 -= *v5++ * sum2;
948: sum4 -= *v4++ * sum3;
949: sum5 -= *v5++ * sum3;
950: sum5 -= *v5++ * sum4;
951:
952: tmp[row ++]=sum1;
953: tmp[row ++]=sum2;
954: tmp[row ++]=sum3;
955: tmp[row ++]=sum4;
956: tmp[row ++]=sum5;
957: break;
958: default:
/* NOTE(review): this error path returns without restoring the index sets and vector arrays */
959: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
960: }
961: }
962: /* backward solve the upper triangular */
/* nodes and rows are visited last to first; row names the LAST row of the current node */
963: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
964: nsz = ns[i];
/* aii is the offset of the last stored entry of this row; v1 and vi walk backwards from it */
965: aii = ai[row+1] -1;
966: v1 = aa + aii;
967: vi = aj + aii;
/* number of entries right of the diagonal in the node's last row */
968: nz = aii- ad[row];
969: switch (nsz){ /* Each loop in 'case' is unrolled */
970: case 1 :
971: sum1 = tmp[row];
973: for(j=nz ; j>1; j-=2){
974: i0 = vi[0];
975: i1 = vi[-1];
976: vi -=2;
977: tmp0 = tmps[i0];
978: tmp1 = tmps[i1];
979: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
980: }
981: if (j==1){
982: tmp0 = tmps[*vi--];
983: sum1 -= *v1-- * tmp0;
984: }
/* NOTE(review): the sum is multiplied by the stored diagonal entry, which suggests the factor stores the inverted pivot -- confirm against the factorization */
985: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
986: break;
987: case 2 :
988: sum1 = tmp[row];
989: sum2 = tmp[row -1];
990: v2 = aa + ai[row]-1;
991: for (j=nz ; j>1; j-=2){
992: i0 = vi[0];
993: i1 = vi[-1];
994: vi -=2;
995: tmp0 = tmps[i0];
996: tmp1 = tmps[i1];
997: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
998: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
999: }
1000: if (j==1){
1001: tmp0 = tmps[*vi--];
1002: sum1 -= *v1-- * tmp0;
1003: sum2 -= *v2-- * tmp0;
1004: }
1005:
/* finish the last row first, then feed its value into the rows above it in the node */
1006: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1007: sum2 -= *v2-- * tmp0;
1008: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1009: break;
1010: case 3 :
1011: sum1 = tmp[row];
1012: sum2 = tmp[row -1];
1013: sum3 = tmp[row -2];
1014: v2 = aa + ai[row]-1;
1015: v3 = aa + ai[row -1]-1;
1016: for (j=nz ; j>1; j-=2){
1017: i0 = vi[0];
1018: i1 = vi[-1];
1019: vi -=2;
1020: tmp0 = tmps[i0];
1021: tmp1 = tmps[i1];
1022: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1023: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1024: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1025: }
1026: if (j==1){
1027: tmp0 = tmps[*vi--];
1028: sum1 -= *v1-- * tmp0;
1029: sum2 -= *v2-- * tmp0;
1030: sum3 -= *v3-- * tmp0;
1031: }
1032: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1033: sum2 -= *v2-- * tmp0;
1034: sum3 -= *v3-- * tmp0;
1035: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1036: sum3 -= *v3-- * tmp0;
1037: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1038:
1039: break;
1040: case 4 :
1041: sum1 = tmp[row];
1042: sum2 = tmp[row -1];
1043: sum3 = tmp[row -2];
1044: sum4 = tmp[row -3];
1045: v2 = aa + ai[row]-1;
1046: v3 = aa + ai[row -1]-1;
1047: v4 = aa + ai[row -2]-1;
1049: for (j=nz ; j>1; j-=2){
1050: i0 = vi[0];
1051: i1 = vi[-1];
1052: vi -=2;
1053: tmp0 = tmps[i0];
1054: tmp1 = tmps[i1];
1055: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1056: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1057: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1058: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1059: }
1060: if (j==1){
1061: tmp0 = tmps[*vi--];
1062: sum1 -= *v1-- * tmp0;
1063: sum2 -= *v2-- * tmp0;
1064: sum3 -= *v3-- * tmp0;
1065: sum4 -= *v4-- * tmp0;
1066: }
1068: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1069: sum2 -= *v2-- * tmp0;
1070: sum3 -= *v3-- * tmp0;
1071: sum4 -= *v4-- * tmp0;
1072: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1073: sum3 -= *v3-- * tmp0;
1074: sum4 -= *v4-- * tmp0;
1075: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1076: sum4 -= *v4-- * tmp0;
1077: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1078: break;
1079: case 5 :
1080: sum1 = tmp[row];
1081: sum2 = tmp[row -1];
1082: sum3 = tmp[row -2];
1083: sum4 = tmp[row -3];
1084: sum5 = tmp[row -4];
1085: v2 = aa + ai[row]-1;
1086: v3 = aa + ai[row -1]-1;
1087: v4 = aa + ai[row -2]-1;
1088: v5 = aa + ai[row -3]-1;
1089: for (j=nz ; j>1; j-=2){
1090: i0 = vi[0];
1091: i1 = vi[-1];
1092: vi -=2;
1093: tmp0 = tmps[i0];
1094: tmp1 = tmps[i1];
1095: sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1096: sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1097: sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1098: sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1099: sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1100: }
1101: if (j==1){
1102: tmp0 = tmps[*vi--];
1103: sum1 -= *v1-- * tmp0;
1104: sum2 -= *v2-- * tmp0;
1105: sum3 -= *v3-- * tmp0;
1106: sum4 -= *v4-- * tmp0;
1107: sum5 -= *v5-- * tmp0;
1108: }
1110: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1111: sum2 -= *v2-- * tmp0;
1112: sum3 -= *v3-- * tmp0;
1113: sum4 -= *v4-- * tmp0;
1114: sum5 -= *v5-- * tmp0;
1115: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1116: sum3 -= *v3-- * tmp0;
1117: sum4 -= *v4-- * tmp0;
1118: sum5 -= *v5-- * tmp0;
1119: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1120: sum4 -= *v4-- * tmp0;
1121: sum5 -= *v5-- * tmp0;
1122: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1123: sum5 -= *v5-- * tmp0;
1124: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1125: break;
1126: default:
/* NOTE(review): this error path returns without restoring the index sets and vector arrays */
1127: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1128: }
1129: }
1130: ISRestoreIndices(isrow,&rout);
1131: ISRestoreIndices(iscol,&cout);
1132: VecRestoreArray(bb,&b);
1133: VecRestoreArray(xx,&x);
1134: PetscLogFlops(2*a->nz - A->n);
1135: return(0);
1136: }
1140: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1141: {
1142: Mat C = *B;
1143: Mat_inode *a = (Mat_inode*)A->data,*b = (Mat_inode*)C->data;
1144: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1146: PetscInt *r,*ic,*c,n = A->m,*bi = b->i;
1147: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1148: PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1149: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1150: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1151: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1152: PetscReal rs=0.0,rsum[3];
1153: LUShift_Ctx sctx;
1154: PetscInt newshift;
1157: /* if both shift schemes are chosen by user, only use info->shiftpd */
1158: if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1159: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1160: sctx.shift_top = 0;
1161: for (i=0; i<n; i++) {
1162: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1163: rs = 0.0;
1164: ajtmp = aj + ai[i];
1165: rtmp1 = aa + ai[i];
1166: nz = ai[i+1] - ai[i];
1167: for (j=0; j<nz; j++){
1168: if (*ajtmp != i){
1169: rs += PetscAbsScalar(*rtmp1++);
1170: } else {
1171: rs -= PetscRealPart(*rtmp1++);
1172: }
1173: ajtmp++;
1174: }
1175: if (rs>sctx.shift_top) sctx.shift_top = rs;
1176: }
1177: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1178: sctx.shift_top *= 1.1;
1179: sctx.nshift_max = 5;
1180: sctx.shift_lo = 0.;
1181: sctx.shift_hi = 1.;
1182: }
1183: sctx.shift_amount = 0;
1184: sctx.nshift = 0;
1186: ISGetIndices(isrow,&r);
1187: ISGetIndices(iscol,&c);
1188: ISGetIndices(isicol,&ic);
1189: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1190: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1191: ics = ic ; rtmps1 = rtmp1 ;
1192: rtmp2 = rtmp1 + n; rtmps2 = rtmp2 ;
1193: rtmp3 = rtmp2 + n; rtmps3 = rtmp3 ;
1194:
1195: node_max = a->inode.node_count;
1196: ns = a->inode.size ;
1197: if (!ns){
1198: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1199: }
1201: /* If max inode size > 3, split it into two inodes.*/
1202: /* also map the inode sizes according to the ordering */
1203: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1204: for (i=0,j=0; i<node_max; ++i,++j){
1205: if (ns[i]>3) {
1206: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1207: ++j;
1208: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1209: } else {
1210: tmp_vec1[j] = ns[i];
1211: }
1212: }
1213: /* Use the correct node_max */
1214: node_max = j;
1216: /* Now reorder the inode info based on mat re-ordering info */
1217: /* First create a row -> inode_size_array_index map */
1218: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1219: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1220: for (i=0,row=0; i<node_max; i++) {
1221: nodesz = tmp_vec1[i];
1222: for (j=0; j<nodesz; j++,row++) {
1223: nsmap[row] = i;
1224: }
1225: }
1226: /* Using nsmap, create a reordered ns structure */
1227: for (i=0,j=0; i< node_max; i++) {
1228: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1229: tmp_vec2[i] = nodesz;
1230: j += nodesz;
1231: }
1232: PetscFree(nsmap);
1233: PetscFree(tmp_vec1);
1234: /* Now use the correct ns */
1235: ns = tmp_vec2;
1237: do {
1238: sctx.lushift = PETSC_FALSE;
1239: /* Now loop over each block-row, and do the factorization */
1240: for (i=0,row=0; i<node_max; i++) {
1241: nodesz = ns[i];
1242: nz = bi[row+1] - bi[row];
1243: bjtmp = bj + bi[row];
1245: switch (nodesz){
1246: case 1:
1247: for (j=0; j<nz; j++){
1248: idx = bjtmp[j];
1249: rtmps1[idx] = 0.0;
1250: }
1251:
1252: /* load in initial (unfactored row) */
1253: idx = r[row];
1254: nz = ai[idx+1] - ai[idx];
1255: ajtmp = aj + ai[idx];
1256: v1 = aa + ai[idx];
1258: for (j=0; j<nz; j++) {
1259: idx = ics[ajtmp[j]];
1260: rtmp1[idx] = v1[j];
1261: if (sctx.nshift && ajtmp[j] == r[row]) {
1262: rtmp1[idx] += sctx.shift_amount;
1263: }
1264: }
1265: prow = *bjtmp++ ;
1266: while (prow < row) {
1267: pc1 = rtmp1 + prow;
1268: if (*pc1 != 0.0){
1269: pv = ba + bd[prow];
1270: pj = nbj + bd[prow];
1271: mul1 = *pc1 * *pv++;
1272: *pc1 = mul1;
1273: nz = bi[prow+1] - bd[prow] - 1;
1274: PetscLogFlops(2*nz);
1275: for (j=0; j<nz; j++) {
1276: tmp = pv[j];
1277: idx = pj[j];
1278: rtmps1[idx] -= mul1 * tmp;
1279: }
1280: }
1281: prow = *bjtmp++ ;
1282: }
1283: nz = bi[row+1] - bi[row];
1284: pj = bj + bi[row];
1285: pc1 = ba + bi[row];
1287: sctx.pv = rtmp1[row];
1288: rs = 0.0;
1289: rtmp1[row] = 1.0/rtmp1[row];
1290: for (j=0; j<nz; j++) {
1291: idx = pj[j];
1292: pc1[j] = rtmps1[idx];
1293: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1294: }
1296: sctx.rs = rs;
1297: MatLUCheckShift_inline(info,sctx,newshift);
1298: if (newshift == 1){
1299: goto endofwhile;
1300: } else if (newshift == -1){
1301: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g, inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1302: }
1303: break;
1304:
1305: case 2:
1306: for (j=0; j<nz; j++) {
1307: idx = bjtmp[j];
1308: rtmps1[idx] = 0.0;
1309: rtmps2[idx] = 0.0;
1310: }
1311:
1312: /* load in initial (unfactored row) */
1313: idx = r[row];
1314: nz = ai[idx+1] - ai[idx];
1315: ajtmp = aj + ai[idx];
1316: v1 = aa + ai[idx];
1317: v2 = aa + ai[idx+1];
1318:
1319: for (j=0; j<nz; j++) {
1320: idx = ics[ajtmp[j]];
1321: rtmp1[idx] = v1[j];
1322: rtmp2[idx] = v2[j];
1323: if (sctx.nshift && ajtmp[j] == r[row]) {
1324: rtmp1[idx] += sctx.shift_amount;
1325: }
1326: if (sctx.nshift && ajtmp[j] == r[row+1]) {
1327: rtmp2[idx] += sctx.shift_amount;
1328: }
1329: }
1330: prow = *bjtmp++ ;
1331: while (prow < row) {
1332: pc1 = rtmp1 + prow;
1333: pc2 = rtmp2 + prow;
1334: if (*pc1 != 0.0 || *pc2 != 0.0){
1335: pv = ba + bd[prow];
1336: pj = nbj + bd[prow];
1337: mul1 = *pc1 * *pv;
1338: mul2 = *pc2 * *pv;
1339: ++pv;
1340: *pc1 = mul1;
1341: *pc2 = mul2;
1342:
1343: nz = bi[prow+1] - bd[prow] - 1;
1344: PetscLogFlops(2*2*nz);
1345: for (j=0; j<nz; j++) {
1346: tmp = pv[j];
1347: idx = pj[j];
1348: rtmps1[idx] -= mul1 * tmp;
1349: rtmps2[idx] -= mul2 * tmp;
1350: }
1351: }
1352: prow = *bjtmp++ ;
1353: }
1354: /* Now take care of the odd element*/
1355: pc1 = rtmp1 + prow;
1356: pc2 = rtmp2 + prow;
1357: if (*pc2 != 0.0){
1358: pj = nbj + bd[prow];
1360: rs = 0.0;
1361: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1362: *pc2 = mul2;
1363: nz = bi[prow+1] - bd[prow] - 1;
1364: PetscLogFlops(2*nz);
1365: for (j=0; j<nz; j++) {
1366: idx = pj[j] ;
1367: tmp = rtmp1[idx];
1368: rtmp2[idx] -= mul2 * tmp;
1369: if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1370: }
1371:
1372: sctx.rs = rs;
1373: sctx.pv = *pc1;
1374: MatLUCheckShift_inline(info,sctx,newshift);
1375: if (newshift == 1){
1376: goto endofwhile; /* sctx.shift_amount is updated */
1377: } else if (newshift == -1){
1378: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1379: }
1380: }
1381:
1382: nz = bi[row+1] - bi[row];
1383: pj = bj + bi[row];
1384: pc1 = ba + bi[row];
1385: pc2 = ba + bi[row+1];
1387: rsum[0] = rsum[1] = 0.0;
1388: rtmp1[row] = 1.0/rtmp1[row];
1389: rtmp2[row+1] = 1.0/rtmp2[row+1];
1390: for (j=0; j<nz; j++) {
1391: idx = pj[j];
1392: pc1[j] = rtmps1[idx];
1393: pc2[j] = rtmps2[idx];
1394: if (idx != row) rsum[0] += PetscAbsScalar(pc1[j]);
1395: if (idx != row+1) rsum[1] += PetscAbsScalar(pc2[j]);
1396: }
1398: sctx.pv = 1.0/rtmp1[row]; /* rtmp1[row] = 1.0/diag[row] */
1399: sctx.rs = rsum[0];
1400: MatLUCheckShift_inline(info,sctx,newshift);
1401: if (newshift == 1){
1402: goto endofwhile;
1403: } else if (newshift == -1){
1404: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1405: }
1406: sctx.pv = 1.0/rtmp2[row+1];
1407: sctx.rs = rsum[1];
1408: MatLUCheckShift_inline(info,sctx,newshift);
1409: if (newshift == 1){
1410: goto endofwhile;
1411: } else if (newshift == -1){
1412: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D",row+1,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1413: }
1414: break;
1416: case 3:
1417: for (j=0; j<nz; j++) {
1418: idx = bjtmp[j];
1419: rtmps1[idx] = 0.0;
1420: rtmps2[idx] = 0.0;
1421: rtmps3[idx] = 0.0;
1422: }
1423: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1424: idx = r[row];
1425: nz = ai[idx+1] - ai[idx];
1426: ajtmp = aj + ai[idx];
1427: v1 = aa + ai[idx];
1428: v2 = aa + ai[idx+1];
1429: v3 = aa + ai[idx+2];
1430: for (j=0; j<nz; j++) {
1431: idx = ics[ajtmp[j]];
1432: rtmp1[idx] = v1[j];
1433: rtmp2[idx] = v2[j];
1434: rtmp3[idx] = v3[j];
1435: if (sctx.nshift && ajtmp[j] == r[row]) {
1436: rtmp1[idx] += sctx.shift_amount;
1437: }
1438: if (sctx.nshift && ajtmp[j] == r[row+1]) {
1439: rtmp2[idx] += sctx.shift_amount;
1440: }
1441: if (sctx.nshift && ajtmp[j] == r[row+2]) {
1442: rtmp3[idx] += sctx.shift_amount;
1443: }
1444: }
1445: /* loop over all pivot row blocks above this row block */
1446: prow = *bjtmp++ ;
1447: while (prow < row) {
1448: pc1 = rtmp1 + prow;
1449: pc2 = rtmp2 + prow;
1450: pc3 = rtmp3 + prow;
1451: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1452: pv = ba + bd[prow];
1453: pj = nbj + bd[prow];
1454: mul1 = *pc1 * *pv;
1455: mul2 = *pc2 * *pv;
1456: mul3 = *pc3 * *pv;
1457: ++pv;
1458: *pc1 = mul1;
1459: *pc2 = mul2;
1460: *pc3 = mul3;
1461:
1462: nz = bi[prow+1] - bd[prow] - 1;
1463: PetscLogFlops(3*2*nz);
1464: /* update this row based on pivot row */
1465: for (j=0; j<nz; j++) {
1466: tmp = pv[j];
1467: idx = pj[j];
1468: rtmps1[idx] -= mul1 * tmp;
1469: rtmps2[idx] -= mul2 * tmp;
1470: rtmps3[idx] -= mul3 * tmp;
1471: }
1472: }
1473: prow = *bjtmp++ ;
1474: }
1475: /* Now take care of diagonal block in this set of rows */
1476: pc1 = rtmp1 + prow;
1477: pc2 = rtmp2 + prow;
1478: pc3 = rtmp3 + prow;
1479: if (*pc2 != 0.0 || *pc3 != 0.0){
1480: pj = nbj + bd[prow];
1482: sctx.rs = 1.0; /* for simplicity, set rs=1.0 */
1483: sctx.pv = *pc1;
1484: MatLUCheckShift_inline(info,sctx,newshift);
1485: if (newshift == 1){
1486: goto endofwhile;
1487: } else if (newshift == -1){
1488: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1489: }
1491: mul2 = (*pc2)/(*pc1);
1492: mul3 = (*pc3)/(*pc1);
1493: *pc2 = mul2;
1494: *pc3 = mul3;
1495: nz = bi[prow+1] - bd[prow] - 1;
1496: PetscLogFlops(2*2*nz);
1497: for (j=0; j<nz; j++) {
1498: idx = pj[j] ;
1499: tmp = rtmp1[idx];
1500: rtmp2[idx] -= mul2 * tmp;
1501: rtmp3[idx] -= mul3 * tmp;
1502: }
1503: }
1504: ++prow;
1505: pc2 = rtmp2 + prow;
1506: pc3 = rtmp3 + prow;
1507: if (*pc3 != 0.0){
1508: pj = nbj + bd[prow];
1509: pj = nbj + bd[prow];
1511: sctx.rs = 1.0; /* for simplicity, set rs=1.0 */
1512: sctx.pv = *pc2;
1513: MatLUCheckShift_inline(info,sctx,newshift);
1514: if (newshift == 1){
1515: goto endofwhile;
1516: } else if (newshift == -1){
1517: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1518: }
1519: mul3 = (*pc3)/(*pc2);
1520: *pc3 = mul3;
1521: nz = bi[prow+1] - bd[prow] - 1;
1522: PetscLogFlops(2*2*nz);
1523: for (j=0; j<nz; j++) {
1524: idx = pj[j] ;
1525: tmp = rtmp2[idx];
1526: rtmp3[idx] -= mul3 * tmp;
1527: }
1528: }
1529: nz = bi[row+1] - bi[row];
1530: pj = bj + bi[row];
1531: pc1 = ba + bi[row];
1532: pc2 = ba + bi[row+1];
1533: pc3 = ba + bi[row+2];
1535: rsum[0] = rsum[1] = rsum[2] = 0.0;
1536: rtmp1[row] = 1.0/rtmp1[row];
1537: rtmp2[row+1] = 1.0/rtmp2[row+1];
1538: rtmp3[row+2] = 1.0/rtmp3[row+2];
1539: /* copy row entries from dense representation to sparse */
1540: for (j=0; j<nz; j++) {
1541: idx = pj[j];
1542: pc1[j] = rtmps1[idx];
1543: pc2[j] = rtmps2[idx];
1544: pc3[j] = rtmps3[idx];
1545: if (idx != row) rsum[0] += PetscAbsScalar(pc1[j]);
1546: if (idx != row+1) rsum[1] += PetscAbsScalar(pc2[j]);
1547: if (idx != row+2) rsum[2] += PetscAbsScalar(pc3[j]);
1548: }
1550: /* sctx.rs = rs/3.0; */
1551: sctx.pv = 1.0/rtmp1[row];
1552: sctx.rs = rsum[0];
1553: MatLUCheckShift_inline(info,sctx,newshift);
1554: if (newshift == 1){
1555: goto endofwhile;
1556: } else if (newshift == -1){
1557: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1558: }
1559: sctx.pv = 1.0/rtmp2[row+1];
1560: sctx.rs = rsum[1];
1561: MatLUCheckShift_inline(info,sctx,newshift);
1562: if (newshift == 1){
1563: goto endofwhile;
1564: } else if (newshift == -1){
1565: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D, 2nd row",row+1,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1566: }
1567: sctx.pv = 1.0/rtmp3[row+2];
1568: sctx.rs = rsum[2];
1569: MatLUCheckShift_inline(info,sctx,newshift);
1570: if (newshift == 1){
1571: goto endofwhile;
1572: } else if (newshift == -1){
1573: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g inode.size %D, 3rd row",row+2,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1574: }
1575: break;
1576: default:
1577: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1578: }
1579: row += nodesz; /* Update the row */
1580: }
1581: endofwhile:;
1582: } while (sctx.lushift);
1583: PetscFree(rtmp1);
1584: PetscFree(tmp_vec2);
1585: ISRestoreIndices(isicol,&ic);
1586: ISRestoreIndices(isrow,&r);
1587: ISRestoreIndices(iscol,&c);
1588: C->factor = FACTOR_LU;
1589: C->assembled = PETSC_TRUE;
1590: if (sctx.nshift) {
1591: if (info->shiftnz) {
1592: PetscLogInfo((0,"MatLUFactorNumeric_Inode: number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));
1593: } else if (info->shiftpd) {
1594: PetscLogInfo((0,"MatLUFactorNumeric_Inode: number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top));
1595: }
1596: }
1597: PetscLogFlops(C->n);
1598: return(0);
1599: }
1601: /*
1602: Makes a longer coloring[] array and calls the usual code with that
1603: */
1606: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt nin,PetscInt ncolors,ISColoringValue coloring[],ISColoring *iscoloring)
1607: {
1608: Mat_inode *a = (Mat_inode*)mat->data;
1609: PetscErrorCode ierr;
1610: PetscInt n = mat->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1611: PetscInt *colorused,i;
1612: ISColoringValue *newcolor;
1615: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1616: /* loop over inodes, marking a color for each column*/
1617: row = 0;
1618: for (i=0; i<m; i++){
1619: for (j=0; j<ns[i]; j++) {
1620: newcolor[row++] = coloring[i] + j*ncolors;
1621: }
1622: }
1624: /* eliminate unneeded colors */
1625: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1626: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1627: for (i=0; i<n; i++) {
1628: colorused[newcolor[i]] = 1;
1629: }
1631: for (i=1; i<5*ncolors; i++) {
1632: colorused[i] += colorused[i-1];
1633: }
1634: ncolors = colorused[5*ncolors-1];
1635: for (i=0; i<n; i++) {
1636: newcolor[i] = colorused[newcolor[i]];
1637: }
1638: PetscFree(colorused);
1639: ISColoringCreate(mat->comm,n,newcolor,iscoloring);
1640: PetscFree(coloring);
1641: return(0);
1642: }
1644: /*
1645: samestructure indicates that the matrix has not changed its nonzero structure so we
1646: do not need to recompute the inodes
1647: */
1650: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1651: {
1652: Mat_inode *a = (Mat_inode*)A->data;
1654: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1655: PetscTruth flag,flg;
1658: if (a->inode.checked && samestructure) return(0);
1660: a->inode.checked = PETSC_TRUE;
1662: /* Notes: We set a->inode.limit=5 in MatCreate_Inode(). */
1663: if (!a->inode.use) {PetscLogInfo((A,"Mat_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n")); return(0);}
1664: PetscOptionsHasName(A->prefix,"-mat_no_inode",&flg);
1665: if (flg) {PetscLogInfo((A,"Mat_CheckInode: Not using Inode routines due to -mat_no_inode\n"));return(0);}
1666: PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
1667: if (flg) {PetscLogInfo((A,"Mat_CheckInode: Not using Inode routines due to -mat_no_unroll\n"));return(0);}
1668: PetscOptionsGetInt(A->prefix,"-mat_inode_limit",&a->inode.limit,PETSC_NULL);
1669: if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
1670: m = A->m;
1671: if (a->inode.size) {ns = a->inode.size;}
1672: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
1674: i = 0;
1675: node_count = 0;
1676: idx = a->j;
1677: ii = a->i;
1678: while (i < m){ /* For each row */
1679: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
1680: /* Limits the number of elements in a node to 'a->inode.limit' */
1681: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1682: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
1683: if (nzy != nzx) break;
1684: idy += nzx; /* Same nonzero pattern */
1685: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
1686: if (!flag) break;
1687: }
1688: ns[node_count++] = blk_size;
1689: idx += blk_size*nzx;
1690: i = j;
1691: }
1692: /* If not enough inodes found,, do not use inode version of the routines */
1693: if (!a->inode.size && m && node_count > 0.9*m) {
1694: PetscFree(ns);
1695: a->inode.node_count = 0;
1696: a->inode.size = PETSC_NULL;
1697: a->inode.use = PETSC_FALSE;
1698: PetscLogInfo((A,"Mat_CheckInode: Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m));
1699: } else {
1700: A->ops->mult = MatMult_Inode;
1701: A->ops->multadd = MatMultAdd_Inode;
1702: A->ops->solve = MatSolve_Inode;
1703: A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1704: A->ops->getrowij = MatGetRowIJ_Inode;
1705: A->ops->restorerowij = MatRestoreRowIJ_Inode;
1706: A->ops->getcolumnij = MatGetColumnIJ_Inode;
1707: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1708: A->ops->coloringpatch = MatColoringPatch_Inode;
1709: a->inode.node_count = node_count;
1710: a->inode.size = ns;
1711: PetscLogInfo((A,"Mat_CheckInode: Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit));
1712: }
1713: return(0);
1714: }
1716: /*
1717: This is really ugly. if inodes are used this replaces the
1718: permutations with ones that correspond to rows/cols of the matrix
1719: rather then inode blocks
1720: */
1723: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1724: {
1725: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
1728: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
1729: if (f) {
1730: (*f)(A,rperm,cperm);
1731: }
1732: return(0);
1733: }
1738: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1739: {
1740: Mat_inode *a=(Mat_inode*)A->data;
1742: PetscInt m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1743: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1744: PetscInt nslim_col,*ns_col;
1745: IS ris = *rperm,cis = *cperm;
1748: if (!a->inode.size) return(0); /* no inodes so return */
1749: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1751: Mat_CreateColInode(A,&nslim_col,&ns_col);
1752: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1753: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1754: permc = permr + m;
1756: ISGetIndices(ris,&ridx);
1757: ISGetIndices(cis,&cidx);
1759: /* Form the inode structure for the rows of permuted matric using inv perm*/
1760: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1762: /* Construct the permutations for rows*/
1763: for (i=0,row = 0; i<nslim_row; ++i){
1764: indx = ridx[i];
1765: start_val = tns[indx];
1766: end_val = tns[indx + 1];
1767: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1768: }
1770: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1771: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1773: /* Construct permutations for columns */
1774: for (i=0,col=0; i<nslim_col; ++i){
1775: indx = cidx[i];
1776: start_val = tns[indx];
1777: end_val = tns[indx + 1];
1778: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1779: }
1781: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1782: ISSetPermutation(*rperm);
1783: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1784: ISSetPermutation(*cperm);
1785:
1786: ISRestoreIndices(ris,&ridx);
1787: ISRestoreIndices(cis,&cidx);
1789: PetscFree(ns_col);
1790: PetscFree(permr);
1791: ISDestroy(cis);
1792: ISDestroy(ris);
1793: PetscFree(tns);
1794: return(0);
1795: }
1800: /*@C
1801: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
1803: Collective on Mat
1805: Input Parameter:
1806: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
1808: Output Parameter:
1809: + node_count - no of inodes present in the matrix.
1810: . sizes - an array of size node_count,with sizes of each inode.
1811: - limit - the max size used to generate the inodes.
1813: Level: advanced
1815: Notes: This routine returns some internal storage information
1816: of the matrix, it is intended to be used by advanced users.
1817: It should be called after the matrix is assembled.
1818: The contents of the sizes[] array should not be changed.
1819: PETSC_NULL may be passed for information not requested.
1821: .keywords: matrix, seqaij, get, inode
1823: .seealso: MatGetInfo()
1824: @*/
1825: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1826: {
1827: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
1830: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1831: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
1832: if (f) {
1833: (*f)(A,node_count,sizes,limit);
1834: }
1835: return(0);
1836: }
1841: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1842: {
1843: Mat_inode *a = (Mat_inode*)A->data;
1846: if (node_count) *node_count = a->inode.node_count;
1847: if (sizes) *sizes = a->inode.size;
1848: if (limit) *limit = a->inode.limit;
1849: return(0);
1850: }