Actual source code: sbaijfact.c
1: /*$Id: sbaijfact.c,v 1.61 2001/08/06 21:15:47 bsmith Exp $*/
2: /* Using Modified Sparse Row (MSR) storage.
3: See page 85, "Iterative Methods ..." by Saad. */
5: /*
6: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7: */
8: #include src/mat/impls/baij/seq/baij.h
9: #include src/mat/impls/sbaij/seq/sbaij.h
10: #include src/vec/vecimpl.h
11: #include src/inline/ilu.h
12: #include include/petscis.h
14: int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
15: {
16: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
17: int *rip,ierr,i,mbs = a->mbs,*ai,*aj;
18: int *jutmp,bs = a->bs,bs2=a->bs2;
19: int m,nzi,realloc = 0;
20: int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
21: PetscTruth perm_identity;
25: /* check whether perm is the identity mapping */
26: ISIdentity(perm,&perm_identity);
27: if (!perm_identity) a->permute = PETSC_TRUE;
28:
29: ISGetIndices(perm,&rip);
30:
31: if (perm_identity){ /* without permutation */
32: ai = a->i; aj = a->j;
33: } else { /* non-trivial permutation */
34: MatReorderingSeqSBAIJ(A,perm);
35: ai = a->inew; aj = a->jnew;
36: }
37:
38: /* initialization */
39: /* Don't know how many column pointers are needed so estimate.
40: Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
41: ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);
42: umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
43: ierr = PetscMalloc(umax*sizeof(int),&ju);
44: iu[0] = mbs+1;
45: juptr = mbs;
46: ierr = PetscMalloc(mbs*sizeof(int),&jl);
47: ierr = PetscMalloc(mbs*sizeof(int),&q);
48: for (i=0; i<mbs; i++){
49: jl[i] = mbs; q[i] = 0;
50: }
52: /* for each row k */
53: for (k=0; k<mbs; k++){
54: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
55: q[k] = mbs;
56: /* initialize nonzero structure of k-th row to row rip[k] of A */
57: jmin = ai[rip[k]];
58: jmax = ai[rip[k]+1];
59: for (j=jmin; j<jmax; j++){
60: vj = rip[aj[j]]; /* col. value */
61: if(vj > k){
62: qm = k;
63: do {
64: m = qm; qm = q[m];
65: } while(qm < vj);
66: if (qm == vj) {
67: printf(" error: duplicate entry in An"); break;
68: }
69: nzk++;
70: q[m] = vj;
71: q[vj] = qm;
72: } /* if(vj > k) */
73: } /* for (j=jmin; j<jmax; j++) */
75: /* modify nonzero structure of k-th row by computing fill-in
76: for each row i to be merged in */
77: i = k;
78: i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
79: /* printf(" next pivot row i=%dn",i); */
80: while (i < mbs){
81: /* merge row i into k-th row */
82: nzi = iu[i+1] - (iu[i]+1);
83: jmin = iu[i] + 1; jmax = iu[i] + nzi;
84: qm = k;
85: for (j=jmin; j<jmax+1; j++){
86: vj = ju[j];
87: do {
88: m = qm; qm = q[m];
89: } while (qm < vj);
90: if (qm != vj){
91: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
92: }
93: }
94: i = jl[i]; /* next pivot row */
95: }
96:
97: /* add k to row list for first nonzero element in k-th row */
98: if (nzk > 0){
99: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
100: jl[k] = jl[i]; jl[i] = k;
101: }
102: iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%dn", k+1, iu[k+1],umax);*/
104: /* allocate more space to ju if needed */
105: if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%dn",k+1, iu[k+1],umax);
106: /* estimate how much additional space we will need */
107: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
108: /* just double the memory each time */
109: maxadd = umax;
110: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
111: umax += maxadd;
113: /* allocate a longer ju */
114: PetscMalloc(umax*sizeof(int),&jutmp);
115: ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
116: ierr = PetscFree(ju);
117: ju = jutmp;
118: realloc++; /* count how many times we realloc */
119: }
121: /* save nonzero structure of k-th row in ju */
122: i=k;
123: jumin = juptr + 1; juptr += nzk;
124: for (j=jumin; j<juptr+1; j++){
125: i=q[i];
126: ju[j]=i;
127: }
128: }
130: if (ai[mbs] != 0) {
131: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
132: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
133: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use n",af);
134: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);n",af);
135: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.n");
136: } else {
137: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.n");
138: }
140: ISRestoreIndices(perm,&rip);
141: PetscFree(q);
142: PetscFree(jl);
144: /* put together the new matrix */
145: MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
146: /* PetscLogObjectParent(*B,iperm); */
147: b = (Mat_SeqSBAIJ*)(*B)->data;
148: PetscFree(b->imax);
149: b->singlemalloc = PETSC_FALSE;
150: /* the next line frees the default space generated by the Create() */
151: PetscFree(b->a);
152: PetscFree(b->ilen);
153: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
154: b->j = ju;
155: b->i = iu;
156: b->diag = 0;
157: b->ilen = 0;
158: b->imax = 0;
159: b->row = perm;
160: b->pivotinblocks = PETSC_FALSE; /* need to get from MatCholeskyInfo */
161: ierr = PetscObjectReference((PetscObject)perm);
162: b->icol = perm;
163: ierr = PetscObjectReference((PetscObject)perm);
164: ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
165: /* In b structure: Free imax, ilen, old a, old j.
166: Allocate idnew, solve_work, new a, new j */
167: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
168: b->s_maxnz = b->s_nz = iu[mbs];
169:
170: (*B)->factor = FACTOR_CHOLESKY;
171: (*B)->info.factor_mallocs = realloc;
172: (*B)->info.fill_ratio_given = f;
173: if (ai[mbs] != 0) {
174: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
175: } else {
176: (*B)->info.fill_ratio_needed = 0.0;
177: }
179: if (perm_identity){
180: switch (bs) {
181: case 1:
182: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
183: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
184: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1n");
185: break;
186: case 2:
187: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
188: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
189: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2n");
190: break;
191: case 3:
192: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
193: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
194: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3n");
195: break;
196: case 4:
197: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
198: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
199: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4n");
200: break;
201: case 5:
202: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
203: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
204: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5n");
205: break;
206: case 6:
207: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
208: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
209: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6n");
210: break;
211: case 7:
212: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
213: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
214: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7n");
215: break;
216: default:
217: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
218: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
219: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7n");
220: break;
221: }
222: }
224: return(0);
225: }
228: int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
229: {
230: Mat C = *B;
231: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
232: IS perm = b->row;
233: int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
234: int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
235: int bs=a->bs,bs2 = a->bs2;
236: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
237: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
238: MatScalar *work;
239: int *pivots;
243: /* initialization */
244: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
245: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
246: PetscMalloc(2*mbs*sizeof(int),&il);
247: jl = il + mbs;
248: for (i=0; i<mbs; i++) {
249: jl[i] = mbs; il[0] = 0;
250: }
251: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
252: uik = dk + bs2;
253: work = uik + bs2;
254: PetscMalloc(bs*sizeof(int),&pivots);
255:
256: ierr = ISGetIndices(perm,&perm_ptr);
257:
258: /* check permutation */
259: if (!a->permute){
260: ai = a->i; aj = a->j; aa = a->a;
261: } else {
262: ai = a->inew; aj = a->jnew;
263: PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);
264: PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));
265: PetscMalloc(ai[mbs]*sizeof(int),&a2anew);
266: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));
268: for (i=0; i<mbs; i++){
269: jmin = ai[i]; jmax = ai[i+1];
270: for (j=jmin; j<jmax; j++){
271: while (a2anew[j] != j){
272: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
273: for (k1=0; k1<bs2; k1++){
274: dk[k1] = aa[k*bs2+k1];
275: aa[k*bs2+k1] = aa[j*bs2+k1];
276: aa[j*bs2+k1] = dk[k1];
277: }
278: }
279: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
280: if (i > aj[j]){
281: /* printf("change orientation, row: %d, col: %dn",i,aj[j]); */
282: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
283: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
284: for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */
285: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
286: }
287: }
288: }
289: }
290: PetscFree(a2anew);
291: }
292:
293: /* for each row k */
294: for (k = 0; k<mbs; k++){
296: /*initialize k-th row with elements nonzero in row perm(k) of A */
297: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
298: if (jmin < jmax) {
299: ap = aa + jmin*bs2;
300: for (j = jmin; j < jmax; j++){
301: vj = perm_ptr[aj[j]]; /* block col. index */
302: rtmp_ptr = rtmp + vj*bs2;
303: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
304: }
305: }
307: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
308: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
309: i = jl[k]; /* first row to be added to k_th row */
311: while (i < mbs){
312: nexti = jl[i]; /* next row to be added to k_th row */
314: /* compute multiplier */
315: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
317: /* uik = -inv(Di)*U_bar(i,k) */
318: diag = ba + i*bs2;
319: u = ba + ili*bs2;
320: PetscMemzero(uik,bs2*sizeof(MatScalar));
321: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
322:
323: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
324: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
325:
326: /* update -U(i,k) */
327: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
329: /* add multiple of row i to k-th row ... */
330: jmin = ili + 1; jmax = bi[i+1];
331: if (jmin < jmax){
332: for (j=jmin; j<jmax; j++) {
333: /* rtmp += -U(i,k)^T * U_bar(i,j) */
334: rtmp_ptr = rtmp + bj[j]*bs2;
335: u = ba + j*bs2;
336: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
337: }
338:
339: /* ... add i to row list for next nonzero entry */
340: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
341: j = bj[jmin];
342: jl[i] = jl[j]; jl[j] = i; /* update jl */
343: }
344: i = nexti;
345: }
347: /* save nonzero entries in k-th row of U ... */
349: /* invert diagonal block */
350: diag = ba+k*bs2;
351: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
352: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
353:
354: jmin = bi[k]; jmax = bi[k+1];
355: if (jmin < jmax) {
356: for (j=jmin; j<jmax; j++){
357: vj = bj[j]; /* block col. index of U */
358: u = ba + j*bs2;
359: rtmp_ptr = rtmp + vj*bs2;
360: for (k1=0; k1<bs2; k1++){
361: *u++ = *rtmp_ptr;
362: *rtmp_ptr++ = 0.0;
363: }
364: }
365:
366: /* ... add k to row list for first nonzero entry in k-th row */
367: il[k] = jmin;
368: i = bj[jmin];
369: jl[k] = jl[i]; jl[i] = k;
370: }
371: }
373: PetscFree(rtmp);
374: PetscFree(il);
375: PetscFree(dk);
376: PetscFree(pivots);
377: if (a->permute){
378: PetscFree(aa);
379: }
381: ISRestoreIndices(perm,&perm_ptr);
382: C->factor = FACTOR_CHOLESKY;
383: C->assembled = PETSC_TRUE;
384: C->preallocated = PETSC_TRUE;
385: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
386: return(0);
387: }
389: int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
390: {
391: Mat C = *B;
392: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
393: int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
394: int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
395: int bs=a->bs,bs2 = a->bs2;
396: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
397: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
398: MatScalar *work;
399: int *pivots;
403: /* initialization */
404:
405: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
406: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
407: PetscMalloc(2*mbs*sizeof(int),&il);
408: jl = il + mbs;
409: for (i=0; i<mbs; i++) {
410: jl[i] = mbs; il[0] = 0;
411: }
412: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
413: uik = dk + bs2;
414: work = uik + bs2;
415: PetscMalloc(bs*sizeof(int),&pivots);
416:
417: ai = a->i; aj = a->j; aa = a->a;
418:
419: /* for each row k */
420: for (k = 0; k<mbs; k++){
422: /*initialize k-th row with elements nonzero in row k of A */
423: jmin = ai[k]; jmax = ai[k+1];
424: if (jmin < jmax) {
425: ap = aa + jmin*bs2;
426: for (j = jmin; j < jmax; j++){
427: vj = aj[j]; /* block col. index */
428: rtmp_ptr = rtmp + vj*bs2;
429: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
430: }
431: }
433: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
434: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
435: i = jl[k]; /* first row to be added to k_th row */
437: while (i < mbs){
438: nexti = jl[i]; /* next row to be added to k_th row */
440: /* compute multiplier */
441: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
443: /* uik = -inv(Di)*U_bar(i,k) */
444: diag = ba + i*bs2;
445: u = ba + ili*bs2;
446: PetscMemzero(uik,bs2*sizeof(MatScalar));
447: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
448:
449: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
450: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
451:
452: /* update -U(i,k) */
453: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
455: /* add multiple of row i to k-th row ... */
456: jmin = ili + 1; jmax = bi[i+1];
457: if (jmin < jmax){
458: for (j=jmin; j<jmax; j++) {
459: /* rtmp += -U(i,k)^T * U_bar(i,j) */
460: rtmp_ptr = rtmp + bj[j]*bs2;
461: u = ba + j*bs2;
462: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
463: }
464:
465: /* ... add i to row list for next nonzero entry */
466: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
467: j = bj[jmin];
468: jl[i] = jl[j]; jl[j] = i; /* update jl */
469: }
470: i = nexti;
471: }
473: /* save nonzero entries in k-th row of U ... */
475: /* invert diagonal block */
476: diag = ba+k*bs2;
477: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
478: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
479:
480: jmin = bi[k]; jmax = bi[k+1];
481: if (jmin < jmax) {
482: for (j=jmin; j<jmax; j++){
483: vj = bj[j]; /* block col. index of U */
484: u = ba + j*bs2;
485: rtmp_ptr = rtmp + vj*bs2;
486: for (k1=0; k1<bs2; k1++){
487: *u++ = *rtmp_ptr;
488: *rtmp_ptr++ = 0.0;
489: }
490: }
491:
492: /* ... add k to row list for first nonzero entry in k-th row */
493: il[k] = jmin;
494: i = bj[jmin];
495: jl[k] = jl[i]; jl[i] = k;
496: }
497: }
499: PetscFree(rtmp);
500: PetscFree(il);
501: PetscFree(dk);
502: PetscFree(pivots);
504: C->factor = FACTOR_CHOLESKY;
505: C->assembled = PETSC_TRUE;
506: C->preallocated = PETSC_TRUE;
507: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
508: return(0);
509: }
511: /*
512: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
513: Version for blocks 2 by 2.
514: */
515: int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
516: {
517: Mat C = *B;
518: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
519: IS perm = b->row;
520: int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
521: int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
522: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
523: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
526:
527: /* initialization */
528: /* il and jl record the first nonzero element in each row of the accessing
529: window U(0:k, k:mbs-1).
530: jl: list of rows to be added to uneliminated rows
531: i>= k: jl(i) is the first row to be added to row i
532: i< k: jl(i) is the row following row i in some list of rows
533: jl(i) = mbs indicates the end of a list
534: il(i): points to the first nonzero element in columns k,...,mbs-1 of
535: row i of U */
536: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
537: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
538: PetscMalloc(2*mbs*sizeof(int),&il);
539: jl = il + mbs;
540: for (i=0; i<mbs; i++) {
541: jl[i] = mbs; il[0] = 0;
542: }
543: PetscMalloc(8*sizeof(MatScalar),&dk);
544: uik = dk + 4;
545: ISGetIndices(perm,&perm_ptr);
547: /* check permutation */
548: if (!a->permute){
549: ai = a->i; aj = a->j; aa = a->a;
550: } else {
551: ai = a->inew; aj = a->jnew;
552: PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);
553: PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));
554: PetscMalloc(ai[mbs]*sizeof(int),&a2anew);
555: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));
557: for (i=0; i<mbs; i++){
558: jmin = ai[i]; jmax = ai[i+1];
559: for (j=jmin; j<jmax; j++){
560: while (a2anew[j] != j){
561: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
562: for (k1=0; k1<4; k1++){
563: dk[k1] = aa[k*4+k1];
564: aa[k*4+k1] = aa[j*4+k1];
565: aa[j*4+k1] = dk[k1];
566: }
567: }
568: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
569: if (i > aj[j]){
570: /* printf("change orientation, row: %d, col: %dn",i,aj[j]); */
571: ap = aa + j*4; /* ptr to the beginning of the block */
572: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
573: ap[1] = ap[2];
574: ap[2] = dk[1];
575: }
576: }
577: }
578: PetscFree(a2anew);
579: }
581: /* for each row k */
582: for (k = 0; k<mbs; k++){
584: /*initialize k-th row with elements nonzero in row perm(k) of A */
585: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
586: if (jmin < jmax) {
587: ap = aa + jmin*4;
588: for (j = jmin; j < jmax; j++){
589: vj = perm_ptr[aj[j]]; /* block col. index */
590: rtmp_ptr = rtmp + vj*4;
591: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
592: }
593: }
595: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
596: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
597: i = jl[k]; /* first row to be added to k_th row */
599: while (i < mbs){
600: nexti = jl[i]; /* next row to be added to k_th row */
602: /* compute multiplier */
603: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
605: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
606: diag = ba + i*4;
607: u = ba + ili*4;
608: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
609: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
610: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
611: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
612:
613: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
614: dk[0] += uik[0]*u[0] + uik[1]*u[1];
615: dk[1] += uik[2]*u[0] + uik[3]*u[1];
616: dk[2] += uik[0]*u[2] + uik[1]*u[3];
617: dk[3] += uik[2]*u[2] + uik[3]*u[3];
619: /* update -U(i,k): ba[ili] = uik */
620: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
622: /* add multiple of row i to k-th row ... */
623: jmin = ili + 1; jmax = bi[i+1];
624: if (jmin < jmax){
625: for (j=jmin; j<jmax; j++) {
626: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
627: rtmp_ptr = rtmp + bj[j]*4;
628: u = ba + j*4;
629: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
630: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
631: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
632: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
633: }
634:
635: /* ... add i to row list for next nonzero entry */
636: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
637: j = bj[jmin];
638: jl[i] = jl[j]; jl[j] = i; /* update jl */
639: }
640: i = nexti;
641: }
643: /* save nonzero entries in k-th row of U ... */
645: /* invert diagonal block */
646: diag = ba+k*4;
647: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
648: Kernel_A_gets_inverse_A_2(diag);
649:
650: jmin = bi[k]; jmax = bi[k+1];
651: if (jmin < jmax) {
652: for (j=jmin; j<jmax; j++){
653: vj = bj[j]; /* block col. index of U */
654: u = ba + j*4;
655: rtmp_ptr = rtmp + vj*4;
656: for (k1=0; k1<4; k1++){
657: *u++ = *rtmp_ptr;
658: *rtmp_ptr++ = 0.0;
659: }
660: }
661:
662: /* ... add k to row list for first nonzero entry in k-th row */
663: il[k] = jmin;
664: i = bj[jmin];
665: jl[k] = jl[i]; jl[i] = k;
666: }
667: }
669: PetscFree(rtmp);
670: PetscFree(il);
671: PetscFree(dk);
672: if (a->permute) {
673: PetscFree(aa);
674: }
675: ISRestoreIndices(perm,&perm_ptr);
676: C->factor = FACTOR_CHOLESKY;
677: C->assembled = PETSC_TRUE;
678: C->preallocated = PETSC_TRUE;
679: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
680: return(0);
681: }
683: /*
684: Version for when blocks are 2 by 2 Using natural ordering
685: */
686: int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
687: {
688: Mat C = *B;
689: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
690: int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
691: int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
692: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
693: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
696:
697: /* initialization */
698: /* il and jl record the first nonzero element in each row of the accessing
699: window U(0:k, k:mbs-1).
700: jl: list of rows to be added to uneliminated rows
701: i>= k: jl(i) is the first row to be added to row i
702: i< k: jl(i) is the row following row i in some list of rows
703: jl(i) = mbs indicates the end of a list
704: il(i): points to the first nonzero element in columns k,...,mbs-1 of
705: row i of U */
706: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
707: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
708: PetscMalloc(2*mbs*sizeof(int),&il);
709: jl = il + mbs;
710: for (i=0; i<mbs; i++) {
711: jl[i] = mbs; il[0] = 0;
712: }
713: PetscMalloc(8*sizeof(MatScalar),&dk);
714: uik = dk + 4;
715:
716: ai = a->i; aj = a->j; aa = a->a;
718: /* for each row k */
719: for (k = 0; k<mbs; k++){
721: /*initialize k-th row with elements nonzero in row k of A */
722: jmin = ai[k]; jmax = ai[k+1];
723: if (jmin < jmax) {
724: ap = aa + jmin*4;
725: for (j = jmin; j < jmax; j++){
726: vj = aj[j]; /* block col. index */
727: rtmp_ptr = rtmp + vj*4;
728: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
729: }
730: }
732: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
733: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
734: i = jl[k]; /* first row to be added to k_th row */
736: while (i < mbs){
737: nexti = jl[i]; /* next row to be added to k_th row */
739: /* compute multiplier */
740: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
742: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
743: diag = ba + i*4;
744: u = ba + ili*4;
745: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
746: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
747: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
748: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
749:
750: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
751: dk[0] += uik[0]*u[0] + uik[1]*u[1];
752: dk[1] += uik[2]*u[0] + uik[3]*u[1];
753: dk[2] += uik[0]*u[2] + uik[1]*u[3];
754: dk[3] += uik[2]*u[2] + uik[3]*u[3];
756: /* update -U(i,k): ba[ili] = uik */
757: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
759: /* add multiple of row i to k-th row ... */
760: jmin = ili + 1; jmax = bi[i+1];
761: if (jmin < jmax){
762: for (j=jmin; j<jmax; j++) {
763: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
764: rtmp_ptr = rtmp + bj[j]*4;
765: u = ba + j*4;
766: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
767: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
768: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
769: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
770: }
771:
772: /* ... add i to row list for next nonzero entry */
773: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
774: j = bj[jmin];
775: jl[i] = jl[j]; jl[j] = i; /* update jl */
776: }
777: i = nexti;
778: }
780: /* save nonzero entries in k-th row of U ... */
782: /* invert diagonal block */
783: diag = ba+k*4;
784: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
785: Kernel_A_gets_inverse_A_2(diag);
786:
787: jmin = bi[k]; jmax = bi[k+1];
788: if (jmin < jmax) {
789: for (j=jmin; j<jmax; j++){
790: vj = bj[j]; /* block col. index of U */
791: u = ba + j*4;
792: rtmp_ptr = rtmp + vj*4;
793: for (k1=0; k1<4; k1++){
794: *u++ = *rtmp_ptr;
795: *rtmp_ptr++ = 0.0;
796: }
797: }
798:
799: /* ... add k to row list for first nonzero entry in k-th row */
800: il[k] = jmin;
801: i = bj[jmin];
802: jl[k] = jl[i]; jl[i] = k;
803: }
804: }
806: PetscFree(rtmp);
807: PetscFree(il);
808: PetscFree(dk);
810: C->factor = FACTOR_CHOLESKY;
811: C->assembled = PETSC_TRUE;
812: C->preallocated = PETSC_TRUE;
813: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
814: return(0);
815: }
817: /*
818: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
819: Version for blocks are 1 by 1.
820: */
821: int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
822: {
823: Mat C = *B;
824: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
825: IS ip = b->row;
826: int *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
827: int *ai,*aj,*r;
828: int k,jmin,jmax,*jl,*il,vj,nexti,ili;
829: MatScalar *rtmp;
830: MatScalar *ba = b->a,*aa,ak;
831: MatScalar dk,uikdi;
834: ierr = ISGetIndices(ip,&rip);
835: if (!a->permute){
836: ai = a->i; aj = a->j; aa = a->a;
837: } else {
838: ai = a->inew; aj = a->jnew;
839: PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);
840: PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));
841: PetscMalloc(ai[mbs]*sizeof(int),&r);
842: ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));
844: jmin = ai[0]; jmax = ai[mbs];
845: for (j=jmin; j<jmax; j++){
846: while (r[j] != j){
847: k = r[j]; r[j] = r[k]; r[k] = k;
848: ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
849: }
850: }
851: PetscFree(r);
852: }
853:
854: /* initialization */
855: /* il and jl record the first nonzero element in each row of the accessing
856: window U(0:k, k:mbs-1).
857: jl: list of rows to be added to uneliminated rows
858: i>= k: jl(i) is the first row to be added to row i
859: i< k: jl(i) is the row following row i in some list of rows
860: jl(i) = mbs indicates the end of a list
861: il(i): points to the first nonzero element in columns k,...,mbs-1 of
862: row i of U */
863: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
864: PetscMalloc(2*mbs*sizeof(int),&il);
865: jl = il + mbs;
866: for (i=0; i<mbs; i++) {
867: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
868: }
870: /* for each row k */
871: for (k = 0; k<mbs; k++){
873: /*initialize k-th row with elements nonzero in row perm(k) of A */
874: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
875: if (jmin < jmax) {
876: for (j = jmin; j < jmax; j++){
877: vj = rip[aj[j]];
878: rtmp[vj] = aa[j];
879: }
880: }
882: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
883: dk = rtmp[k];
884: i = jl[k]; /* first row to be added to k_th row */
886: while (i < mbs){
887: nexti = jl[i]; /* next row to be added to k_th row */
889: /* compute multiplier, update D(k) and U(i,k) */
890: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
891: uikdi = - ba[ili]*ba[i];
892: dk += uikdi*ba[ili];
893: ba[ili] = uikdi; /* -U(i,k) */
895: /* add multiple of row i to k-th row ... */
896: jmin = ili + 1; jmax = bi[i+1];
897: if (jmin < jmax){
898: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
899: /* ... add i to row list for next nonzero entry */
900: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
901: j = bj[jmin];
902: jl[i] = jl[j]; jl[j] = i; /* update jl */
903: }
904: i = nexti;
905: }
907: /* check for zero pivot and save diagoanl element */
908: if (dk == 0.0){
909: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
910: /*
911: }else if (PetscRealPart(dk) < 0){
912: PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %gn",k,dk);
913: */
914: }
916: /* save nonzero entries in k-th row of U ... */
917: ba[k] = 1.0/dk;
918: jmin = bi[k]; jmax = bi[k+1];
919: if (jmin < jmax) {
920: for (j=jmin; j<jmax; j++){
921: vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
922: }
923: /* ... add k to row list for first nonzero entry in k-th row */
924: il[k] = jmin;
925: i = bj[jmin];
926: jl[k] = jl[i]; jl[i] = k;
927: }
928: }
929:
930: PetscFree(rtmp);
931: PetscFree(il);
932: if (a->permute){
933: PetscFree(aa);
934: }
936: ISRestoreIndices(ip,&rip);
937: C->factor = FACTOR_CHOLESKY;
938: C->assembled = PETSC_TRUE;
939: C->preallocated = PETSC_TRUE;
940: PetscLogFlops(b->mbs);
941: return(0);
942: }
944: /*
945: Version for when blocks are 1 by 1 Using natural ordering
946: */
947: int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
948: {
949: Mat C = *B;
950: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
951: int ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
952: int *ai,*aj;
953: int k,jmin,jmax,*jl,*il,vj,nexti,ili;
954: MatScalar *rtmp,*ba = b->a,*aa,dk,uikdi;
957:
958: /* initialization */
959: /* il and jl record the first nonzero element in each row of the accessing
960: window U(0:k, k:mbs-1).
961: jl: list of rows to be added to uneliminated rows
962: i>= k: jl(i) is the first row to be added to row i
963: i< k: jl(i) is the row following row i in some list of rows
964: jl(i) = mbs indicates the end of a list
965: il(i): points to the first nonzero element in columns k,...,mbs-1 of
966: row i of U */
967: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
968: PetscMalloc(2*mbs*sizeof(int),&il);
969: jl = il + mbs;
970: for (i=0; i<mbs; i++) {
971: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
972: }
974: ai = a->i; aj = a->j; aa = a->a;
976: /* for each row k */
977: for (k = 0; k<mbs; k++){
979: /*initialize k-th row with elements nonzero in row perm(k) of A */
980: jmin = ai[k]; jmax = ai[k+1];
981: if (jmin < jmax) {
982: for (j = jmin; j < jmax; j++){
983: vj = aj[j];
984: rtmp[vj] = aa[j];
985: }
986: }
988: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
989: dk = rtmp[k];
990: i = jl[k]; /* first row to be added to k_th row */
992: while (i < mbs){
993: nexti = jl[i]; /* next row to be added to k_th row */
995: /* compute multiplier, update D(k) and U(i,k) */
996: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
997: uikdi = - ba[ili]*ba[i];
998: dk += uikdi*ba[ili];
999: ba[ili] = uikdi; /* -U(i,k) */
1001: /* add multiple of row i to k-th row ... */
1002: jmin = ili + 1; jmax = bi[i+1];
1003: if (jmin < jmax){
1004: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1005: /* ... add i to row list for next nonzero entry */
1006: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1007: j = bj[jmin];
1008: jl[i] = jl[j]; jl[j] = i; /* update jl */
1009: }
1010: i = nexti;
1011: }
1013: /* check for zero pivot and save diagoanl element */
1014: if (dk == 0.0){
1015: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1016: /*
1017: }else if (PetscRealPart(dk) < 0){
1018: PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %gn",k,dk);
1019: */
1020: }
1022: /* save nonzero entries in k-th row of U ... */
1023: ba[k] = 1.0/dk;
1024: jmin = bi[k]; jmax = bi[k+1];
1025: if (jmin < jmax) {
1026: for (j=jmin; j<jmax; j++){
1027: vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1028: }
1029: /* ... add k to row list for first nonzero entry in k-th row */
1030: il[k] = jmin;
1031: i = bj[jmin];
1032: jl[k] = jl[i]; jl[i] = k;
1033: }
1034: }
1035:
1036: PetscFree(rtmp);
1037: PetscFree(il);
1038:
1039: C->factor = FACTOR_CHOLESKY;
1040: C->assembled = PETSC_TRUE;
1041: C->preallocated = PETSC_TRUE;
1042: PetscLogFlops(b->mbs);
1043: return(0);
1044: }
1046: int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
1047: {
1049: Mat C;
1052: MatCholeskyFactorSymbolic(A,perm,f,&C);
1053: MatCholeskyFactorNumeric(A,&C);
1054: MatHeaderCopy(A,C);
1055: return(0);
1056: }