Actual source code: sbaijfact.c
1: #define PETSCMAT_DLL
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/mat/impls/sbaij/seq/sbaij.h
5: #include src/inline/ilu.h
6: #include include/petscis.h
8: #if !defined(PETSC_USE_COMPLEX)
9: /*
10: input:
11: F -- numeric factor
12: output:
13: nneg, nzero, npos: matrix inertia
14: */
18: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
19: {
20: Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
21: PetscScalar *dd = fact_ptr->a;
22: PetscInt mbs=fact_ptr->mbs,bs=F->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;
25: if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
26: nneig_tmp = 0; npos_tmp = 0;
27: for (i=0; i<mbs; i++){
28: if (PetscRealPart(dd[*fi]) > 0.0){
29: npos_tmp++;
30: } else if (PetscRealPart(dd[*fi]) < 0.0){
31: nneig_tmp++;
32: }
33: fi++;
34: }
35: if (nneig) *nneig = nneig_tmp;
36: if (npos) *npos = npos_tmp;
37: if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;
39: return(0);
40: }
41: #endif /* !defined(PETSC_USE_COMPLEX) */
43: /*
44: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
45: Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad.
46: */
49: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
50: {
51: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
53: PetscInt *rip,i,mbs = a->mbs,*ai,*aj;
54: PetscInt *jutmp,bs = A->bs,bs2=a->bs2;
55: PetscInt m,reallocs = 0,prow;
56: PetscInt *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
57: PetscReal f = info->fill;
58: PetscTruth perm_identity;
61: /* check whether perm is the identity mapping */
62: ISIdentity(perm,&perm_identity);
63: ISGetIndices(perm,&rip);
64:
65: if (perm_identity){ /* without permutation */
66: a->permute = PETSC_FALSE;
67: ai = a->i; aj = a->j;
68: } else { /* non-trivial permutation */
69: a->permute = PETSC_TRUE;
70: MatReorderingSeqSBAIJ(A,perm);
71: ai = a->inew; aj = a->jnew;
72: }
73:
74: /* initialization */
75: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
76: umax = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
77: PetscMalloc(umax*sizeof(PetscInt),&ju);
78: iu[0] = mbs+1;
79: juidx = mbs + 1; /* index for ju */
80: PetscMalloc(2*mbs*sizeof(PetscInt),&jl); /* linked list for pivot row */
81: q = jl + mbs; /* linked list for col index */
82: for (i=0; i<mbs; i++){
83: jl[i] = mbs;
84: q[i] = 0;
85: }
87: /* for each row k */
88: for (k=0; k<mbs; k++){
89: for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */
90: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
91: q[k] = mbs;
92: /* initialize nonzero structure of k-th row to row rip[k] of A */
93: jmin = ai[rip[k]] +1; /* exclude diag[k] */
94: jmax = ai[rip[k]+1];
95: for (j=jmin; j<jmax; j++){
96: vj = rip[aj[j]]; /* col. value */
97: if(vj > k){
98: qm = k;
99: do {
100: m = qm; qm = q[m];
101: } while(qm < vj);
102: if (qm == vj) {
103: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
104: }
105: nzk++;
106: q[m] = vj;
107: q[vj] = qm;
108: } /* if(vj > k) */
109: } /* for (j=jmin; j<jmax; j++) */
111: /* modify nonzero structure of k-th row by computing fill-in
112: for each row i to be merged in */
113: prow = k;
114: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
115:
116: while (prow < k){
117: /* merge row prow into k-th row */
118: jmin = iu[prow] + 1; jmax = iu[prow+1];
119: qm = k;
120: for (j=jmin; j<jmax; j++){
121: vj = ju[j];
122: do {
123: m = qm; qm = q[m];
124: } while (qm < vj);
125: if (qm != vj){
126: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
127: }
128: }
129: prow = jl[prow]; /* next pivot row */
130: }
131:
132: /* add k to row list for first nonzero element in k-th row */
133: if (nzk > 0){
134: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
135: jl[k] = jl[i]; jl[i] = k;
136: }
137: iu[k+1] = iu[k] + nzk;
139: /* allocate more space to ju if needed */
140: if (iu[k+1] > umax) {
141: /* estimate how much additional space we will need */
142: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143: /* just double the memory each time */
144: maxadd = umax;
145: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
146: umax += maxadd;
148: /* allocate a longer ju */
149: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
150: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
151: PetscFree(ju);
152: ju = jutmp;
153: reallocs++; /* count how many times we realloc */
154: }
156: /* save nonzero structure of k-th row in ju */
157: i=k;
158: while (nzk --) {
159: i = q[i];
160: ju[juidx++] = i;
161: }
162: }
164: #if defined(PETSC_USE_DEBUG)
165: if (ai[mbs] != 0) {
166: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
167: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af));
168: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af));
169: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af));
170: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n"));
171: } else {
172: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"));
173: }
174: #endif
176: ISRestoreIndices(perm,&rip);
177: PetscFree(jl);
179: /* put together the new matrix */
180: MatCreate(A->comm,B);
181: MatSetSizes(*B,bs*mbs,bs*mbs,bs*mbs,bs*mbs);
182: MatSetType(*B,A->type_name);
183: MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
185: /* PetscLogObjectParent(*B,iperm); */
186: b = (Mat_SeqSBAIJ*)(*B)->data;
187: b->singlemalloc = PETSC_FALSE;
188: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
189: b->j = ju;
190: b->i = iu;
191: b->diag = 0;
192: b->ilen = 0;
193: b->imax = 0;
194: b->row = perm;
195: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196: PetscObjectReference((PetscObject)perm);
197: b->icol = perm;
198: PetscObjectReference((PetscObject)perm);
199: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
200: /* In b structure: Free imax, ilen, old a, old j.
201: Allocate idnew, solve_work, new a, new j */
202: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
203: b->maxnz = b->nz = iu[mbs];
204:
205: (*B)->factor = FACTOR_CHOLESKY;
206: (*B)->info.factor_mallocs = reallocs;
207: (*B)->info.fill_ratio_given = f;
208: if (ai[mbs] != 0) {
209: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
210: } else {
211: (*B)->info.fill_ratio_needed = 0.0;
212: }
214: if (perm_identity){
215: switch (bs) {
216: case 1:
217: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
218: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
219: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"));
220: break;
221: case 2:
222: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
223: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
224: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"));
225: break;
226: case 3:
227: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
228: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
229: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"));
230: break;
231: case 4:
232: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
233: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
234: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"));
235: break;
236: case 5:
237: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
238: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
239: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"));
240: break;
241: case 6:
242: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
243: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
244: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"));
245: break;
246: case 7:
247: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
248: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
249: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"));
250: break;
251: default:
252: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
253: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
254: PetscLogInfo((A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"));
255: break;
256: }
257: }
258: return(0);
259: }
260: /*
261: Symbolic U^T*D*U factorization for SBAIJ format.
262: */
263: #include petscbt.h
264: #include src/mat/utils/freespace.h
267: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
268: {
269: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
270: Mat_SeqSBAIJ *b;
271: Mat B;
273: PetscTruth perm_identity;
274: PetscReal fill = info->fill;
275: PetscInt *rip,i,mbs=a->mbs,bs=A->bs,*ai,*aj,reallocs=0,prow;
276: PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
277: PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
278: FreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
279: PetscBT lnkbt;
282: /*
283: This code originally uses Modified Sparse Row (MSR) storage
284: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
285: Then it is rewritten so the factor B takes seqsbaij format. However the associated
286: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity,
287: thus the original code in MSR format is still used for these cases.
288: The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever
289: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
290: */
291: if (bs > 1){
292: MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);
293: return(0);
294: }
296: /* check whether perm is the identity mapping */
297: ISIdentity(perm,&perm_identity);
299: if (perm_identity){
300: a->permute = PETSC_FALSE;
301: ai = a->i; aj = a->j;
302: } else {
303: a->permute = PETSC_TRUE;
304: MatReorderingSeqSBAIJ(A,perm);
305: ai = a->inew; aj = a->jnew;
306: }
307: ISGetIndices(perm,&rip);
309: /* initialization */
310: PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);
311: ui[0] = 0;
313: /* jl: linked list for storing indices of the pivot rows
314: il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
315: PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);
316: il = jl + mbs;
317: cols = il + mbs;
318: ui_ptr = (PetscInt**)(cols + mbs);
319:
320: for (i=0; i<mbs; i++){
321: jl[i] = mbs; il[i] = 0;
322: }
324: /* create and initialize a linked list for storing column indices of the active row k */
325: nlnk = mbs + 1;
326: PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);
328: /* initial FreeSpace size is fill*(ai[mbs]+1) */
329: GetMoreSpace((PetscInt)(fill*(ai[mbs]+1)),&free_space);
330: current_space = free_space;
332: for (k=0; k<mbs; k++){ /* for each active row k */
333: /* initialize lnk by the column indices of row rip[k] of A */
334: nzk = 0;
335: ncols = ai[rip[k]+1] - ai[rip[k]];
336: for (j=0; j<ncols; j++){
337: i = *(aj + ai[rip[k]] + j);
338: cols[j] = rip[i];
339: }
340: PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);
341: nzk += nlnk;
343: /* update lnk by computing fill-in for each pivot row to be merged in */
344: prow = jl[k]; /* 1st pivot row */
345:
346: while (prow < k){
347: nextprow = jl[prow];
348: /* merge prow into k-th row */
349: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
350: jmax = ui[prow+1];
351: ncols = jmax-jmin;
352: uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
353: PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);
354: nzk += nlnk;
356: /* update il and jl for prow */
357: if (jmin < jmax){
358: il[prow] = jmin;
359: j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
360: }
361: prow = nextprow;
362: }
364: /* if free space is not available, make more free space */
365: if (current_space->local_remaining<nzk) {
366: i = mbs - k + 1; /* num of unfactored rows */
367: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
368: GetMoreSpace(i,¤t_space);
369: reallocs++;
370: }
372: /* copy data into free space, then initialize lnk */
373: PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);
375: /* add the k-th row into il and jl */
376: if (nzk-1 > 0){
377: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
378: jl[k] = jl[i]; jl[i] = k;
379: il[k] = ui[k] + 1;
380: }
381: ui_ptr[k] = current_space->array;
382: current_space->array += nzk;
383: current_space->local_used += nzk;
384: current_space->local_remaining -= nzk;
386: ui[k+1] = ui[k] + nzk;
387: }
389: #if defined(PETSC_USE_DEBUG)
390: if (ai[mbs] != 0) {
391: PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
392: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,fill,af));
393: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af));
394: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g) for best performance.\n",af));
395: } else {
396: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n"));
397: }
398: #endif
400: ISRestoreIndices(perm,&rip);
401: PetscFree(jl);
403: /* destroy list of free space and other temporary array(s) */
404: PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);
405: MakeSpaceContiguous(&free_space,uj);
406: PetscLLDestroy(lnk,lnkbt);
408: /* put together the new matrix in MATSEQSBAIJ format */
409: MatCreate(PETSC_COMM_SELF,fact);
410: MatSetSizes(*fact,mbs,mbs,mbs,mbs);
411: B = *fact;
412: MatSetType(B,MATSEQSBAIJ);
413: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
415: b = (Mat_SeqSBAIJ*)B->data;
416: b->singlemalloc = PETSC_FALSE;
417: PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);
418: b->j = uj;
419: b->i = ui;
420: b->diag = 0;
421: b->ilen = 0;
422: b->imax = 0;
423: b->row = perm;
424: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
425: PetscObjectReference((PetscObject)perm);
426: b->icol = perm;
427: PetscObjectReference((PetscObject)perm);
428: PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);
429: PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
430: b->maxnz = b->nz = ui[mbs];
431:
432: B->factor = FACTOR_CHOLESKY;
433: B->info.factor_mallocs = reallocs;
434: B->info.fill_ratio_given = fill;
435: if (ai[mbs] != 0) {
436: B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
437: } else {
438: B->info.fill_ratio_needed = 0.0;
439: }
441: if (perm_identity){
442: switch (bs) {
443: case 1:
444: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
445: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
446: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n"));
447: break;
448: case 2:
449: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
450: B->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
451: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n"));
452: break;
453: case 3:
454: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
455: B->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
456: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n"));
457: break;
458: case 4:
459: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
460: B->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
461: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n"));
462: break;
463: case 5:
464: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
465: B->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
466: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n"));
467: break;
468: case 6:
469: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
470: B->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
471: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n"));
472: break;
473: case 7:
474: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
475: B->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
476: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n"));
477: break;
478: default:
479: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
480: B->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
481: PetscLogInfo((A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n"));
482: break;
483: }
484: }
485: return(0);
486: }
489: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,MatFactorInfo *info,Mat *B)
490: {
491: Mat C = *B;
492: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
493: IS perm = b->row;
495: PetscInt *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
496: PetscInt *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
497: PetscInt bs=A->bs,bs2 = a->bs2;
498: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
499: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
500: MatScalar *work;
501: PetscInt *pivots;
504: /* initialization */
505: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
506: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
507: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
508: jl = il + mbs;
509: for (i=0; i<mbs; i++) {
510: jl[i] = mbs; il[0] = 0;
511: }
512: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
513: uik = dk + bs2;
514: work = uik + bs2;
515: PetscMalloc(bs*sizeof(PetscInt),&pivots);
516:
517: ISGetIndices(perm,&perm_ptr);
518:
519: /* check permutation */
520: if (!a->permute){
521: ai = a->i; aj = a->j; aa = a->a;
522: } else {
523: ai = a->inew; aj = a->jnew;
524: PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);
525: PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));
526: PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);
527: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));
529: for (i=0; i<mbs; i++){
530: jmin = ai[i]; jmax = ai[i+1];
531: for (j=jmin; j<jmax; j++){
532: while (a2anew[j] != j){
533: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
534: for (k1=0; k1<bs2; k1++){
535: dk[k1] = aa[k*bs2+k1];
536: aa[k*bs2+k1] = aa[j*bs2+k1];
537: aa[j*bs2+k1] = dk[k1];
538: }
539: }
540: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
541: if (i > aj[j]){
542: /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
543: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
544: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
545: for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */
546: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
547: }
548: }
549: }
550: }
551: PetscFree(a2anew);
552: }
553:
554: /* for each row k */
555: for (k = 0; k<mbs; k++){
557: /*initialize k-th row with elements nonzero in row perm(k) of A */
558: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
559:
560: ap = aa + jmin*bs2;
561: for (j = jmin; j < jmax; j++){
562: vj = perm_ptr[aj[j]]; /* block col. index */
563: rtmp_ptr = rtmp + vj*bs2;
564: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
565: }
567: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
568: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
569: i = jl[k]; /* first row to be added to k_th row */
571: while (i < k){
572: nexti = jl[i]; /* next row to be added to k_th row */
574: /* compute multiplier */
575: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
577: /* uik = -inv(Di)*U_bar(i,k) */
578: diag = ba + i*bs2;
579: u = ba + ili*bs2;
580: PetscMemzero(uik,bs2*sizeof(MatScalar));
581: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
582:
583: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
584: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
585:
586: /* update -U(i,k) */
587: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
589: /* add multiple of row i to k-th row ... */
590: jmin = ili + 1; jmax = bi[i+1];
591: if (jmin < jmax){
592: for (j=jmin; j<jmax; j++) {
593: /* rtmp += -U(i,k)^T * U_bar(i,j) */
594: rtmp_ptr = rtmp + bj[j]*bs2;
595: u = ba + j*bs2;
596: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
597: }
598:
599: /* ... add i to row list for next nonzero entry */
600: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
601: j = bj[jmin];
602: jl[i] = jl[j]; jl[j] = i; /* update jl */
603: }
604: i = nexti;
605: }
607: /* save nonzero entries in k-th row of U ... */
609: /* invert diagonal block */
610: diag = ba+k*bs2;
611: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
612: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
613:
614: jmin = bi[k]; jmax = bi[k+1];
615: if (jmin < jmax) {
616: for (j=jmin; j<jmax; j++){
617: vj = bj[j]; /* block col. index of U */
618: u = ba + j*bs2;
619: rtmp_ptr = rtmp + vj*bs2;
620: for (k1=0; k1<bs2; k1++){
621: *u++ = *rtmp_ptr;
622: *rtmp_ptr++ = 0.0;
623: }
624: }
625:
626: /* ... add k to row list for first nonzero entry in k-th row */
627: il[k] = jmin;
628: i = bj[jmin];
629: jl[k] = jl[i]; jl[i] = k;
630: }
631: }
633: PetscFree(rtmp);
634: PetscFree(il);
635: PetscFree(dk);
636: PetscFree(pivots);
637: if (a->permute){
638: PetscFree(aa);
639: }
641: ISRestoreIndices(perm,&perm_ptr);
642: C->factor = FACTOR_CHOLESKY;
643: C->assembled = PETSC_TRUE;
644: C->preallocated = PETSC_TRUE;
645: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
646: return(0);
647: }
651: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
652: {
653: Mat C = *B;
654: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
656: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
657: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
658: PetscInt bs=A->bs,bs2 = a->bs2;
659: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
660: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
661: MatScalar *work;
662: PetscInt *pivots;
665: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
666: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
667: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
668: jl = il + mbs;
669: for (i=0; i<mbs; i++) {
670: jl[i] = mbs; il[0] = 0;
671: }
672: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
673: uik = dk + bs2;
674: work = uik + bs2;
675: PetscMalloc(bs*sizeof(PetscInt),&pivots);
676:
677: ai = a->i; aj = a->j; aa = a->a;
678:
679: /* for each row k */
680: for (k = 0; k<mbs; k++){
682: /*initialize k-th row with elements nonzero in row k of A */
683: jmin = ai[k]; jmax = ai[k+1];
684: ap = aa + jmin*bs2;
685: for (j = jmin; j < jmax; j++){
686: vj = aj[j]; /* block col. index */
687: rtmp_ptr = rtmp + vj*bs2;
688: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
689: }
691: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
692: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
693: i = jl[k]; /* first row to be added to k_th row */
695: while (i < k){
696: nexti = jl[i]; /* next row to be added to k_th row */
698: /* compute multiplier */
699: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
701: /* uik = -inv(Di)*U_bar(i,k) */
702: diag = ba + i*bs2;
703: u = ba + ili*bs2;
704: PetscMemzero(uik,bs2*sizeof(MatScalar));
705: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
706:
707: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
708: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
709:
710: /* update -U(i,k) */
711: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
713: /* add multiple of row i to k-th row ... */
714: jmin = ili + 1; jmax = bi[i+1];
715: if (jmin < jmax){
716: for (j=jmin; j<jmax; j++) {
717: /* rtmp += -U(i,k)^T * U_bar(i,j) */
718: rtmp_ptr = rtmp + bj[j]*bs2;
719: u = ba + j*bs2;
720: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
721: }
722:
723: /* ... add i to row list for next nonzero entry */
724: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
725: j = bj[jmin];
726: jl[i] = jl[j]; jl[j] = i; /* update jl */
727: }
728: i = nexti;
729: }
731: /* save nonzero entries in k-th row of U ... */
733: /* invert diagonal block */
734: diag = ba+k*bs2;
735: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
736: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
737:
738: jmin = bi[k]; jmax = bi[k+1];
739: if (jmin < jmax) {
740: for (j=jmin; j<jmax; j++){
741: vj = bj[j]; /* block col. index of U */
742: u = ba + j*bs2;
743: rtmp_ptr = rtmp + vj*bs2;
744: for (k1=0; k1<bs2; k1++){
745: *u++ = *rtmp_ptr;
746: *rtmp_ptr++ = 0.0;
747: }
748: }
749:
750: /* ... add k to row list for first nonzero entry in k-th row */
751: il[k] = jmin;
752: i = bj[jmin];
753: jl[k] = jl[i]; jl[i] = k;
754: }
755: }
757: PetscFree(rtmp);
758: PetscFree(il);
759: PetscFree(dk);
760: PetscFree(pivots);
762: C->factor = FACTOR_CHOLESKY;
763: C->assembled = PETSC_TRUE;
764: C->preallocated = PETSC_TRUE;
765: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
766: return(0);
767: }
769: /*
770: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
771: Version for blocks 2 by 2.
772: */
775: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,MatFactorInfo *info,Mat *B)
776: {
777: Mat C = *B;
778: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
779: IS perm = b->row;
781: PetscInt *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
782: PetscInt *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
783: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
784: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
787: /* initialization */
788: /* il and jl record the first nonzero element in each row of the accessing
789: window U(0:k, k:mbs-1).
790: jl: list of rows to be added to uneliminated rows
791: i>= k: jl(i) is the first row to be added to row i
792: i< k: jl(i) is the row following row i in some list of rows
793: jl(i) = mbs indicates the end of a list
794: il(i): points to the first nonzero element in columns k,...,mbs-1 of
795: row i of U */
796: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
797: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
798: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
799: jl = il + mbs;
800: for (i=0; i<mbs; i++) {
801: jl[i] = mbs; il[0] = 0;
802: }
803: PetscMalloc(8*sizeof(MatScalar),&dk);
804: uik = dk + 4;
805: ISGetIndices(perm,&perm_ptr);
807: /* check permutation */
808: if (!a->permute){
809: ai = a->i; aj = a->j; aa = a->a;
810: } else {
811: ai = a->inew; aj = a->jnew;
812: PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);
813: PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));
814: PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);
815: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));
817: for (i=0; i<mbs; i++){
818: jmin = ai[i]; jmax = ai[i+1];
819: for (j=jmin; j<jmax; j++){
820: while (a2anew[j] != j){
821: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
822: for (k1=0; k1<4; k1++){
823: dk[k1] = aa[k*4+k1];
824: aa[k*4+k1] = aa[j*4+k1];
825: aa[j*4+k1] = dk[k1];
826: }
827: }
828: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
829: if (i > aj[j]){
830: /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
831: ap = aa + j*4; /* ptr to the beginning of the block */
832: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
833: ap[1] = ap[2];
834: ap[2] = dk[1];
835: }
836: }
837: }
838: PetscFree(a2anew);
839: }
841: /* for each row k */
842: for (k = 0; k<mbs; k++){
844: /*initialize k-th row with elements nonzero in row perm(k) of A */
845: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
846: ap = aa + jmin*4;
847: for (j = jmin; j < jmax; j++){
848: vj = perm_ptr[aj[j]]; /* block col. index */
849: rtmp_ptr = rtmp + vj*4;
850: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
851: }
853: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
854: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
855: i = jl[k]; /* first row to be added to k_th row */
857: while (i < k){
858: nexti = jl[i]; /* next row to be added to k_th row */
860: /* compute multiplier */
861: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
863: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
864: diag = ba + i*4;
865: u = ba + ili*4;
866: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
867: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
868: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
869: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
870:
871: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
872: dk[0] += uik[0]*u[0] + uik[1]*u[1];
873: dk[1] += uik[2]*u[0] + uik[3]*u[1];
874: dk[2] += uik[0]*u[2] + uik[1]*u[3];
875: dk[3] += uik[2]*u[2] + uik[3]*u[3];
877: /* update -U(i,k): ba[ili] = uik */
878: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
880: /* add multiple of row i to k-th row ... */
881: jmin = ili + 1; jmax = bi[i+1];
882: if (jmin < jmax){
883: for (j=jmin; j<jmax; j++) {
884: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
885: rtmp_ptr = rtmp + bj[j]*4;
886: u = ba + j*4;
887: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
888: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
889: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
890: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
891: }
892:
893: /* ... add i to row list for next nonzero entry */
894: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
895: j = bj[jmin];
896: jl[i] = jl[j]; jl[j] = i; /* update jl */
897: }
898: i = nexti;
899: }
901: /* save nonzero entries in k-th row of U ... */
903: /* invert diagonal block */
904: diag = ba+k*4;
905: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
906: Kernel_A_gets_inverse_A_2(diag);
907:
908: jmin = bi[k]; jmax = bi[k+1];
909: if (jmin < jmax) {
910: for (j=jmin; j<jmax; j++){
911: vj = bj[j]; /* block col. index of U */
912: u = ba + j*4;
913: rtmp_ptr = rtmp + vj*4;
914: for (k1=0; k1<4; k1++){
915: *u++ = *rtmp_ptr;
916: *rtmp_ptr++ = 0.0;
917: }
918: }
919:
920: /* ... add k to row list for first nonzero entry in k-th row */
921: il[k] = jmin;
922: i = bj[jmin];
923: jl[k] = jl[i]; jl[i] = k;
924: }
925: }
927: PetscFree(rtmp);
928: PetscFree(il);
929: PetscFree(dk);
930: if (a->permute) {
931: PetscFree(aa);
932: }
933: ISRestoreIndices(perm,&perm_ptr);
934: C->factor = FACTOR_CHOLESKY;
935: C->assembled = PETSC_TRUE;
936: C->preallocated = PETSC_TRUE;
937: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
938: return(0);
939: }
941: /*
942: Version for when blocks are 2 by 2 Using natural ordering
943: */
946: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
947: {
948: Mat C = *B;
949: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
951: PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
952: PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
953: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
954: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
957: /* initialization */
958: /* il and jl record the first nonzero element in each row of the accessing
959: window U(0:k, k:mbs-1).
960: jl: list of rows to be added to uneliminated rows
961: i>= k: jl(i) is the first row to be added to row i
962: i< k: jl(i) is the row following row i in some list of rows
963: jl(i) = mbs indicates the end of a list
964: il(i): points to the first nonzero element in columns k,...,mbs-1 of
965: row i of U */
966: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
967: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
968: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
969: jl = il + mbs;
970: for (i=0; i<mbs; i++) {
971: jl[i] = mbs; il[0] = 0;
972: }
973: PetscMalloc(8*sizeof(MatScalar),&dk);
974: uik = dk + 4;
975:
976: ai = a->i; aj = a->j; aa = a->a;
978: /* for each row k */
979: for (k = 0; k<mbs; k++){
981: /*initialize k-th row with elements nonzero in row k of A */
982: jmin = ai[k]; jmax = ai[k+1];
983: ap = aa + jmin*4;
984: for (j = jmin; j < jmax; j++){
985: vj = aj[j]; /* block col. index */
986: rtmp_ptr = rtmp + vj*4;
987: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
988: }
989:
990: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
991: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
992: i = jl[k]; /* first row to be added to k_th row */
994: while (i < k){
995: nexti = jl[i]; /* next row to be added to k_th row */
997: /* compute multiplier */
998: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1000: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1001: diag = ba + i*4;
1002: u = ba + ili*4;
1003: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1004: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1005: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1006: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1007:
1008: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1009: dk[0] += uik[0]*u[0] + uik[1]*u[1];
1010: dk[1] += uik[2]*u[0] + uik[3]*u[1];
1011: dk[2] += uik[0]*u[2] + uik[1]*u[3];
1012: dk[3] += uik[2]*u[2] + uik[3]*u[3];
1014: /* update -U(i,k): ba[ili] = uik */
1015: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
1017: /* add multiple of row i to k-th row ... */
1018: jmin = ili + 1; jmax = bi[i+1];
1019: if (jmin < jmax){
1020: for (j=jmin; j<jmax; j++) {
1021: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1022: rtmp_ptr = rtmp + bj[j]*4;
1023: u = ba + j*4;
1024: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1025: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1026: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1027: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1028: }
1029:
1030: /* ... add i to row list for next nonzero entry */
1031: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1032: j = bj[jmin];
1033: jl[i] = jl[j]; jl[j] = i; /* update jl */
1034: }
1035: i = nexti;
1036: }
1038: /* save nonzero entries in k-th row of U ... */
1040: /* invert diagonal block */
1041: diag = ba+k*4;
1042: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
1043: Kernel_A_gets_inverse_A_2(diag);
1044:
1045: jmin = bi[k]; jmax = bi[k+1];
1046: if (jmin < jmax) {
1047: for (j=jmin; j<jmax; j++){
1048: vj = bj[j]; /* block col. index of U */
1049: u = ba + j*4;
1050: rtmp_ptr = rtmp + vj*4;
1051: for (k1=0; k1<4; k1++){
1052: *u++ = *rtmp_ptr;
1053: *rtmp_ptr++ = 0.0;
1054: }
1055: }
1056:
1057: /* ... add k to row list for first nonzero entry in k-th row */
1058: il[k] = jmin;
1059: i = bj[jmin];
1060: jl[k] = jl[i]; jl[i] = k;
1061: }
1062: }
1064: PetscFree(rtmp);
1065: PetscFree(il);
1066: PetscFree(dk);
1068: C->factor = FACTOR_CHOLESKY;
1069: C->assembled = PETSC_TRUE;
1070: C->preallocated = PETSC_TRUE;
1071: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1072: return(0);
1073: }
1075: /*
1076: Numeric U^T*D*U factorization for SBAIJ format.
1077: Version for blocks are 1 by 1.
1078: */
1081: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,MatFactorInfo *info,Mat *B)
1082: {
1083: Mat C = *B;
1084: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1085: IS ip=b->row;
1087: PetscInt *rip,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1088: PetscInt *ai,*aj,*a2anew;
1089: PetscInt k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1090: MatScalar *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1091: PetscReal zeropivot,rs,shiftnz;
1092: PetscTruth shiftpd;
1093: ChShift_Ctx sctx;
1094: PetscInt newshift;
1097: /* initialization */
1098: shiftnz = info->shiftnz;
1099: shiftpd = info->shiftpd;
1100: zeropivot = info->zeropivot;
1102: ISGetIndices(ip,&rip);
1103: if (!a->permute){
1104: ai = a->i; aj = a->j; aa = a->a;
1105: } else {
1106: ai = a->inew; aj = a->jnew;
1107: nz = ai[mbs];
1108: PetscMalloc(nz*sizeof(MatScalar),&aa);
1109: a2anew = a->a2anew;
1110: bval = a->a;
1111: for (j=0; j<nz; j++){
1112: aa[a2anew[j]] = *(bval++);
1113: }
1114: }
1115:
1116: /* initialization */
1117: /* il and jl record the first nonzero element in each row of the accessing
1118: window U(0:k, k:mbs-1).
1119: jl: list of rows to be added to uneliminated rows
1120: i>= k: jl(i) is the first row to be added to row i
1121: i< k: jl(i) is the row following row i in some list of rows
1122: jl(i) = mbs indicates the end of a list
1123: il(i): points to the first nonzero element in columns k,...,mbs-1 of
1124: row i of U */
1125: nz = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
1126: PetscMalloc(nz,&il);
1127: jl = il + mbs;
1128: rtmp = (MatScalar*)(jl + mbs);
1130: sctx.shift_amount = 0;
1131: sctx.nshift = 0;
1132: do {
1133: sctx.chshift = PETSC_FALSE;
1134: for (i=0; i<mbs; i++) {
1135: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1136: }
1137:
1138: for (k = 0; k<mbs; k++){
1139: /*initialize k-th row by the perm[k]-th row of A */
1140: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1141: bval = ba + bi[k];
1142: for (j = jmin; j < jmax; j++){
1143: col = rip[aj[j]];
1144: rtmp[col] = aa[j];
1145: *bval++ = 0.0; /* for in-place factorization */
1146: }
1148: /* shift the diagonal of the matrix */
1149: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1151: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1152: dk = rtmp[k];
1153: i = jl[k]; /* first row to be added to k_th row */
1155: while (i < k){
1156: nexti = jl[i]; /* next row to be added to k_th row */
1158: /* compute multiplier, update diag(k) and U(i,k) */
1159: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1160: uikdi = - ba[ili]*ba[bi[i]]; /* diagonal(k) */
1161: dk += uikdi*ba[ili];
1162: ba[ili] = uikdi; /* -U(i,k) */
1164: /* add multiple of row i to k-th row */
1165: jmin = ili + 1; jmax = bi[i+1];
1166: if (jmin < jmax){
1167: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1168: /* update il and jl for row i */
1169: il[i] = jmin;
1170: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1171: }
1172: i = nexti;
1173: }
1175: /* shift the diagonals when zero pivot is detected */
1176: /* compute rs=sum of abs(off-diagonal) */
1177: rs = 0.0;
1178: jmin = bi[k]+1;
1179: nz = bi[k+1] - jmin;
1180: if (nz){
1181: bcol = bj + jmin;
1182: while (nz--){
1183: rs += PetscAbsScalar(rtmp[*bcol]);
1184: bcol++;
1185: }
1186: }
1188: sctx.rs = rs;
1189: sctx.pv = dk;
1190: MatCholeskyCheckShift_inline(info,sctx,newshift);
1191: if (newshift == 1){
1192: break; /* sctx.shift_amount is updated */
1193: } else if (newshift == -1){
1194: SETERRQ4(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g",k,PetscAbsScalar(dk),zeropivot,rs);
1195: }
1197: /* copy data into U(k,:) */
1198: ba[bi[k]] = 1.0/dk; /* U(k,k) */
1199: jmin = bi[k]+1; jmax = bi[k+1];
1200: if (jmin < jmax) {
1201: for (j=jmin; j<jmax; j++){
1202: col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1203: }
1204: /* add the k-th row into il and jl */
1205: il[k] = jmin;
1206: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1207: }
1208: }
1209: } while (sctx.chshift);
1210: PetscFree(il);
1211: if (a->permute){PetscFree(aa);}
1213: ISRestoreIndices(ip,&rip);
1214: C->factor = FACTOR_CHOLESKY;
1215: C->assembled = PETSC_TRUE;
1216: C->preallocated = PETSC_TRUE;
1217: PetscLogFlops(C->m);
1218: if (sctx.nshift){
1219: if (shiftnz) {
1220: PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1: number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));
1221: } else if (shiftpd) {
1222: PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1: number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));
1223: }
1224: }
1225: return(0);
1226: }
1228: /*
1229: Version for when blocks are 1 by 1 Using natural ordering
1230: */
1233: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
1234: {
1235: Mat C = *B;
1236: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1238: PetscInt i,j,mbs = a->mbs;
1239: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1240: PetscInt k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1241: MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1242: PetscReal zeropivot,rs,shiftnz;
1243: PetscTruth shiftpd;
1244: ChShift_Ctx sctx;
1245: PetscInt newshift;
1248: /* initialization */
1249: shiftnz = info->shiftnz;
1250: shiftpd = info->shiftpd;
1251: zeropivot = info->zeropivot;
1253: /* il and jl record the first nonzero element in each row of the accessing
1254: window U(0:k, k:mbs-1).
1255: jl: list of rows to be added to uneliminated rows
1256: i>= k: jl(i) is the first row to be added to row i
1257: i< k: jl(i) is the row following row i in some list of rows
1258: jl(i) = mbs indicates the end of a list
1259: il(i): points to the first nonzero element in U(i,k:mbs-1)
1260: */
1261: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
1262: PetscMalloc(2*mbs*sizeof(PetscInt),&il);
1263: jl = il + mbs;
1265: sctx.shift_amount = 0;
1266: sctx.nshift = 0;
1267: do {
1268: sctx.chshift = PETSC_FALSE;
1269: for (i=0; i<mbs; i++) {
1270: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1271: }
1273: for (k = 0; k<mbs; k++){
1274: /*initialize k-th row with elements nonzero in row perm(k) of A */
1275: nz = ai[k+1] - ai[k];
1276: acol = aj + ai[k];
1277: aval = aa + ai[k];
1278: bval = ba + bi[k];
1279: while (nz -- ){
1280: rtmp[*acol++] = *aval++;
1281: *bval++ = 0.0; /* for in-place factorization */
1282: }
1284: /* shift the diagonal of the matrix */
1285: if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1286:
1287: /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1288: dk = rtmp[k];
1289: i = jl[k]; /* first row to be added to k_th row */
1291: while (i < k){
1292: nexti = jl[i]; /* next row to be added to k_th row */
1293: /* compute multiplier, update D(k) and U(i,k) */
1294: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1295: uikdi = - ba[ili]*ba[bi[i]];
1296: dk += uikdi*ba[ili];
1297: ba[ili] = uikdi; /* -U(i,k) */
1299: /* add multiple of row i to k-th row ... */
1300: jmin = ili + 1;
1301: nz = bi[i+1] - jmin;
1302: if (nz > 0){
1303: bcol = bj + jmin;
1304: bval = ba + jmin;
1305: while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1306: /* update il and jl for i-th row */
1307: il[i] = jmin;
1308: j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1309: }
1310: i = nexti;
1311: }
1313: /* shift the diagonals when zero pivot is detected */
1314: /* compute rs=sum of abs(off-diagonal) */
1315: rs = 0.0;
1316: jmin = bi[k]+1;
1317: nz = bi[k+1] - jmin;
1318: if (nz){
1319: bcol = bj + jmin;
1320: while (nz--){
1321: rs += PetscAbsScalar(rtmp[*bcol]);
1322: bcol++;
1323: }
1324: }
1326: sctx.rs = rs;
1327: sctx.pv = dk;
1328: MatCholeskyCheckShift_inline(info,sctx,newshift);
1329: if (newshift == 1){
1330: break; /* sctx.shift_amount is updated */
1331: } else if (newshift == -1){
1332: SETERRQ4(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %g tolerance %g * rs %g",k,PetscAbsScalar(dk),zeropivot,rs);
1333: }
1334:
1335: /* copy data into U(k,:) */
1336: ba[bi[k]] = 1.0/dk;
1337: jmin = bi[k]+1;
1338: nz = bi[k+1] - jmin;
1339: if (nz){
1340: bcol = bj + jmin;
1341: bval = ba + jmin;
1342: while (nz--){
1343: *bval++ = rtmp[*bcol];
1344: rtmp[*bcol++] = 0.0;
1345: }
1346: /* add k-th row into il and jl */
1347: il[k] = jmin;
1348: i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1349: }
1350: } /* end of for (k = 0; k<mbs; k++) */
1351: } while (sctx.chshift);
1352: PetscFree(rtmp);
1353: PetscFree(il);
1354:
1355: C->factor = FACTOR_CHOLESKY;
1356: C->assembled = PETSC_TRUE;
1357: C->preallocated = PETSC_TRUE;
1358: PetscLogFlops(C->m);
1359: if (sctx.nshift){
1360: if (shiftnz) {
1361: PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering: number of shiftnz tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));
1362: } else if (shiftpd) {
1363: PetscLogInfo((0,"MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering: number of shiftpd tries %D, shift_amount %g\n",sctx.nshift,sctx.shift_amount));
1364: }
1365: }
1366: return(0);
1367: }
1371: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1372: {
1374: Mat C;
1377: MatCholeskyFactorSymbolic(A,perm,info,&C);
1378: MatCholeskyFactorNumeric(A,info,&C);
1379: MatHeaderCopy(A,C);
1380: return(0);
1381: }