Actual source code: baijfact2.c
1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include src/mat/impls/baij/seq/baij.h
7: #include src/vec/vecimpl.h
8: #include src/inline/ilu.h
9: #include src/inline/dot.h
11: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
12: {
13: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
14: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
15: int *diag = a->diag;
16: MatScalar *aa=a->a,*v;
17: PetscScalar s1,*x,*b;
20: VecCopy(bb,xx);
21: VecGetArray(bb,&b);
22: VecGetArray(xx,&x);
23:
24: /* forward solve the U^T */
25: for (i=0; i<n; i++) {
27: v = aa + diag[i];
28: /* multiply by the inverse of the block diagonal */
29: s1 = (*v++)*x[i];
30: vi = aj + diag[i] + 1;
31: nz = ai[i+1] - diag[i] - 1;
32: while (nz--) {
33: x[*vi++] -= (*v++)*s1;
34: }
35: x[i] = s1;
36: }
37: /* backward solve the L^T */
38: for (i=n-1; i>=0; i--){
39: v = aa + diag[i] - 1;
40: vi = aj + diag[i] - 1;
41: nz = diag[i] - ai[i];
42: s1 = x[i];
43: while (nz--) {
44: x[*vi--] -= (*v--)*s1;
45: }
46: }
47: VecRestoreArray(bb,&b);
48: VecRestoreArray(xx,&x);
49: PetscLogFlops(2*(a->nz) - A->n);
50: return(0);
51: }
53: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
54: {
55: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
56: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
57: int *diag = a->diag,oidx;
58: MatScalar *aa=a->a,*v;
59: PetscScalar s1,s2,x1,x2;
60: PetscScalar *x,*b;
63: VecCopy(bb,xx);
64: VecGetArray(bb,&b);
65: VecGetArray(xx,&x);
67: /* forward solve the U^T */
68: idx = 0;
69: for (i=0; i<n; i++) {
71: v = aa + 4*diag[i];
72: /* multiply by the inverse of the block diagonal */
73: x1 = x[idx]; x2 = x[1+idx];
74: s1 = v[0]*x1 + v[1]*x2;
75: s2 = v[2]*x1 + v[3]*x2;
76: v += 4;
78: vi = aj + diag[i] + 1;
79: nz = ai[i+1] - diag[i] - 1;
80: while (nz--) {
81: oidx = 2*(*vi++);
82: x[oidx] -= v[0]*s1 + v[1]*s2;
83: x[oidx+1] -= v[2]*s1 + v[3]*s2;
84: v += 4;
85: }
86: x[idx] = s1;x[1+idx] = s2;
87: idx += 2;
88: }
89: /* backward solve the L^T */
90: for (i=n-1; i>=0; i--){
91: v = aa + 4*diag[i] - 4;
92: vi = aj + diag[i] - 1;
93: nz = diag[i] - ai[i];
94: idt = 2*i;
95: s1 = x[idt]; s2 = x[1+idt];
96: while (nz--) {
97: idx = 2*(*vi--);
98: x[idx] -= v[0]*s1 + v[1]*s2;
99: x[idx+1] -= v[2]*s1 + v[3]*s2;
100: v -= 4;
101: }
102: }
103: VecRestoreArray(bb,&b);
104: VecRestoreArray(xx,&x);
105: PetscLogFlops(2*4*(a->nz) - 2*A->n);
106: return(0);
107: }
109: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
110: {
111: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
112: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
113: int *diag = a->diag,oidx;
114: MatScalar *aa=a->a,*v;
115: PetscScalar s1,s2,s3,x1,x2,x3;
116: PetscScalar *x,*b;
119: VecCopy(bb,xx);
120: VecGetArray(bb,&b);
121: VecGetArray(xx,&x);
123: /* forward solve the U^T */
124: idx = 0;
125: for (i=0; i<n; i++) {
127: v = aa + 9*diag[i];
128: /* multiply by the inverse of the block diagonal */
129: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
130: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
131: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
132: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
133: v += 9;
135: vi = aj + diag[i] + 1;
136: nz = ai[i+1] - diag[i] - 1;
137: while (nz--) {
138: oidx = 3*(*vi++);
139: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
140: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
141: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
142: v += 9;
143: }
144: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
145: idx += 3;
146: }
147: /* backward solve the L^T */
148: for (i=n-1; i>=0; i--){
149: v = aa + 9*diag[i] - 9;
150: vi = aj + diag[i] - 1;
151: nz = diag[i] - ai[i];
152: idt = 3*i;
153: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
154: while (nz--) {
155: idx = 3*(*vi--);
156: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
157: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
158: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
159: v -= 9;
160: }
161: }
162: VecRestoreArray(bb,&b);
163: VecRestoreArray(xx,&x);
164: PetscLogFlops(2*9*(a->nz) - 3*A->n);
165: return(0);
166: }
168: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
169: {
170: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
171: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
172: int *diag = a->diag,oidx;
173: MatScalar *aa=a->a,*v;
174: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
175: PetscScalar *x,*b;
178: VecCopy(bb,xx);
179: VecGetArray(bb,&b);
180: VecGetArray(xx,&x);
182: /* forward solve the U^T */
183: idx = 0;
184: for (i=0; i<n; i++) {
186: v = aa + 16*diag[i];
187: /* multiply by the inverse of the block diagonal */
188: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
189: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
190: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
191: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
192: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
193: v += 16;
195: vi = aj + diag[i] + 1;
196: nz = ai[i+1] - diag[i] - 1;
197: while (nz--) {
198: oidx = 4*(*vi++);
199: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
200: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
201: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
202: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
203: v += 16;
204: }
205: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
206: idx += 4;
207: }
208: /* backward solve the L^T */
209: for (i=n-1; i>=0; i--){
210: v = aa + 16*diag[i] - 16;
211: vi = aj + diag[i] - 1;
212: nz = diag[i] - ai[i];
213: idt = 4*i;
214: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
215: while (nz--) {
216: idx = 4*(*vi--);
217: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
218: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
219: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
220: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
221: v -= 16;
222: }
223: }
224: VecRestoreArray(bb,&b);
225: VecRestoreArray(xx,&x);
226: PetscLogFlops(2*16*(a->nz) - 4*A->n);
227: return(0);
228: }
230: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
231: {
232: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
233: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
234: int *diag = a->diag,oidx;
235: MatScalar *aa=a->a,*v;
236: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
237: PetscScalar *x,*b;
240: VecCopy(bb,xx);
241: VecGetArray(bb,&b);
242: VecGetArray(xx,&x);
244: /* forward solve the U^T */
245: idx = 0;
246: for (i=0; i<n; i++) {
248: v = aa + 25*diag[i];
249: /* multiply by the inverse of the block diagonal */
250: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
251: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
252: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
253: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
254: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
255: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
256: v += 25;
258: vi = aj + diag[i] + 1;
259: nz = ai[i+1] - diag[i] - 1;
260: while (nz--) {
261: oidx = 5*(*vi++);
262: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
263: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
264: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
265: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
266: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
267: v += 25;
268: }
269: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
270: idx += 5;
271: }
272: /* backward solve the L^T */
273: for (i=n-1; i>=0; i--){
274: v = aa + 25*diag[i] - 25;
275: vi = aj + diag[i] - 1;
276: nz = diag[i] - ai[i];
277: idt = 5*i;
278: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
279: while (nz--) {
280: idx = 5*(*vi--);
281: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
282: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
283: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
284: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
285: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
286: v -= 25;
287: }
288: }
289: VecRestoreArray(bb,&b);
290: VecRestoreArray(xx,&x);
291: PetscLogFlops(2*25*(a->nz) - 5*A->n);
292: return(0);
293: }
295: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
296: {
297: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
298: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
299: int *diag = a->diag,oidx;
300: MatScalar *aa=a->a,*v;
301: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
302: PetscScalar *x,*b;
305: VecCopy(bb,xx);
306: VecGetArray(bb,&b);
307: VecGetArray(xx,&x);
309: /* forward solve the U^T */
310: idx = 0;
311: for (i=0; i<n; i++) {
313: v = aa + 36*diag[i];
314: /* multiply by the inverse of the block diagonal */
315: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
316: x6 = x[5+idx];
317: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
318: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
319: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
320: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
321: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
322: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
323: v += 36;
325: vi = aj + diag[i] + 1;
326: nz = ai[i+1] - diag[i] - 1;
327: while (nz--) {
328: oidx = 6*(*vi++);
329: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
330: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
331: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
332: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
333: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
334: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
335: v += 36;
336: }
337: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
338: x[5+idx] = s6;
339: idx += 6;
340: }
341: /* backward solve the L^T */
342: for (i=n-1; i>=0; i--){
343: v = aa + 36*diag[i] - 36;
344: vi = aj + diag[i] - 1;
345: nz = diag[i] - ai[i];
346: idt = 6*i;
347: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
348: s6 = x[5+idt];
349: while (nz--) {
350: idx = 6*(*vi--);
351: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
352: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
353: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
354: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
355: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
356: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
357: v -= 36;
358: }
359: }
360: VecRestoreArray(bb,&b);
361: VecRestoreArray(xx,&x);
362: PetscLogFlops(2*36*(a->nz) - 6*A->n);
363: return(0);
364: }
366: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
367: {
368: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
369: int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
370: int *diag = a->diag,oidx;
371: MatScalar *aa=a->a,*v;
372: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
373: PetscScalar *x,*b;
376: VecCopy(bb,xx);
377: VecGetArray(bb,&b);
378: VecGetArray(xx,&x);
380: /* forward solve the U^T */
381: idx = 0;
382: for (i=0; i<n; i++) {
384: v = aa + 49*diag[i];
385: /* multiply by the inverse of the block diagonal */
386: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
387: x6 = x[5+idx]; x7 = x[6+idx];
388: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
389: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
390: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
391: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
392: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
393: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
394: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
395: v += 49;
397: vi = aj + diag[i] + 1;
398: nz = ai[i+1] - diag[i] - 1;
399: while (nz--) {
400: oidx = 7*(*vi++);
401: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
402: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
403: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
404: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
405: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
406: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
407: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
408: v += 49;
409: }
410: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
411: x[5+idx] = s6;x[6+idx] = s7;
412: idx += 7;
413: }
414: /* backward solve the L^T */
415: for (i=n-1; i>=0; i--){
416: v = aa + 49*diag[i] - 49;
417: vi = aj + diag[i] - 1;
418: nz = diag[i] - ai[i];
419: idt = 7*i;
420: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
421: s6 = x[5+idt];s7 = x[6+idt];
422: while (nz--) {
423: idx = 7*(*vi--);
424: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
425: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431: v -= 49;
432: }
433: }
434: VecRestoreArray(bb,&b);
435: VecRestoreArray(xx,&x);
436: PetscLogFlops(2*49*(a->nz) - 7*A->n);
437: return(0);
438: }
440: /*---------------------------------------------------------------------------------------------*/
441: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
442: {
443: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
444: IS iscol=a->col,isrow=a->row;
445: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
446: int *diag = a->diag;
447: MatScalar *aa=a->a,*v;
448: PetscScalar s1,*x,*b,*t;
451: VecGetArray(bb,&b);
452: VecGetArray(xx,&x);
453: t = a->solve_work;
455: ISGetIndices(isrow,&rout); r = rout;
456: ISGetIndices(iscol,&cout); c = cout;
458: /* copy the b into temp work space according to permutation */
459: for (i=0; i<n; i++) {
460: t[i] = b[c[i]];
461: }
463: /* forward solve the U^T */
464: for (i=0; i<n; i++) {
466: v = aa + diag[i];
467: /* multiply by the inverse of the block diagonal */
468: s1 = (*v++)*t[i];
469: vi = aj + diag[i] + 1;
470: nz = ai[i+1] - diag[i] - 1;
471: while (nz--) {
472: t[*vi++] -= (*v++)*s1;
473: }
474: t[i] = s1;
475: }
476: /* backward solve the L^T */
477: for (i=n-1; i>=0; i--){
478: v = aa + diag[i] - 1;
479: vi = aj + diag[i] - 1;
480: nz = diag[i] - ai[i];
481: s1 = t[i];
482: while (nz--) {
483: t[*vi--] -= (*v--)*s1;
484: }
485: }
487: /* copy t into x according to permutation */
488: for (i=0; i<n; i++) {
489: x[r[i]] = t[i];
490: }
492: ISRestoreIndices(isrow,&rout);
493: ISRestoreIndices(iscol,&cout);
494: VecRestoreArray(bb,&b);
495: VecRestoreArray(xx,&x);
496: PetscLogFlops(2*(a->nz) - A->n);
497: return(0);
498: }
500: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
501: {
502: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
503: IS iscol=a->col,isrow=a->row;
504: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
505: int *diag = a->diag,ii,ic,ir,oidx;
506: MatScalar *aa=a->a,*v;
507: PetscScalar s1,s2,x1,x2;
508: PetscScalar *x,*b,*t;
511: VecGetArray(bb,&b);
512: VecGetArray(xx,&x);
513: t = a->solve_work;
515: ISGetIndices(isrow,&rout); r = rout;
516: ISGetIndices(iscol,&cout); c = cout;
518: /* copy the b into temp work space according to permutation */
519: ii = 0;
520: for (i=0; i<n; i++) {
521: ic = 2*c[i];
522: t[ii] = b[ic];
523: t[ii+1] = b[ic+1];
524: ii += 2;
525: }
527: /* forward solve the U^T */
528: idx = 0;
529: for (i=0; i<n; i++) {
531: v = aa + 4*diag[i];
532: /* multiply by the inverse of the block diagonal */
533: x1 = t[idx]; x2 = t[1+idx];
534: s1 = v[0]*x1 + v[1]*x2;
535: s2 = v[2]*x1 + v[3]*x2;
536: v += 4;
538: vi = aj + diag[i] + 1;
539: nz = ai[i+1] - diag[i] - 1;
540: while (nz--) {
541: oidx = 2*(*vi++);
542: t[oidx] -= v[0]*s1 + v[1]*s2;
543: t[oidx+1] -= v[2]*s1 + v[3]*s2;
544: v += 4;
545: }
546: t[idx] = s1;t[1+idx] = s2;
547: idx += 2;
548: }
549: /* backward solve the L^T */
550: for (i=n-1; i>=0; i--){
551: v = aa + 4*diag[i] - 4;
552: vi = aj + diag[i] - 1;
553: nz = diag[i] - ai[i];
554: idt = 2*i;
555: s1 = t[idt]; s2 = t[1+idt];
556: while (nz--) {
557: idx = 2*(*vi--);
558: t[idx] -= v[0]*s1 + v[1]*s2;
559: t[idx+1] -= v[2]*s1 + v[3]*s2;
560: v -= 4;
561: }
562: }
564: /* copy t into x according to permutation */
565: ii = 0;
566: for (i=0; i<n; i++) {
567: ir = 2*r[i];
568: x[ir] = t[ii];
569: x[ir+1] = t[ii+1];
570: ii += 2;
571: }
573: ISRestoreIndices(isrow,&rout);
574: ISRestoreIndices(iscol,&cout);
575: VecRestoreArray(bb,&b);
576: VecRestoreArray(xx,&x);
577: PetscLogFlops(2*4*(a->nz) - 2*A->n);
578: return(0);
579: }
581: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
582: {
583: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
584: IS iscol=a->col,isrow=a->row;
585: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
586: int *diag = a->diag,ii,ic,ir,oidx;
587: MatScalar *aa=a->a,*v;
588: PetscScalar s1,s2,s3,x1,x2,x3;
589: PetscScalar *x,*b,*t;
592: VecGetArray(bb,&b);
593: VecGetArray(xx,&x);
594: t = a->solve_work;
596: ISGetIndices(isrow,&rout); r = rout;
597: ISGetIndices(iscol,&cout); c = cout;
599: /* copy the b into temp work space according to permutation */
600: ii = 0;
601: for (i=0; i<n; i++) {
602: ic = 3*c[i];
603: t[ii] = b[ic];
604: t[ii+1] = b[ic+1];
605: t[ii+2] = b[ic+2];
606: ii += 3;
607: }
609: /* forward solve the U^T */
610: idx = 0;
611: for (i=0; i<n; i++) {
613: v = aa + 9*diag[i];
614: /* multiply by the inverse of the block diagonal */
615: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
616: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
617: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
618: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
619: v += 9;
621: vi = aj + diag[i] + 1;
622: nz = ai[i+1] - diag[i] - 1;
623: while (nz--) {
624: oidx = 3*(*vi++);
625: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
626: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
627: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
628: v += 9;
629: }
630: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
631: idx += 3;
632: }
633: /* backward solve the L^T */
634: for (i=n-1; i>=0; i--){
635: v = aa + 9*diag[i] - 9;
636: vi = aj + diag[i] - 1;
637: nz = diag[i] - ai[i];
638: idt = 3*i;
639: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
640: while (nz--) {
641: idx = 3*(*vi--);
642: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
643: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
644: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
645: v -= 9;
646: }
647: }
649: /* copy t into x according to permutation */
650: ii = 0;
651: for (i=0; i<n; i++) {
652: ir = 3*r[i];
653: x[ir] = t[ii];
654: x[ir+1] = t[ii+1];
655: x[ir+2] = t[ii+2];
656: ii += 3;
657: }
659: ISRestoreIndices(isrow,&rout);
660: ISRestoreIndices(iscol,&cout);
661: VecRestoreArray(bb,&b);
662: VecRestoreArray(xx,&x);
663: PetscLogFlops(2*9*(a->nz) - 3*A->n);
664: return(0);
665: }
667: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
668: {
669: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
670: IS iscol=a->col,isrow=a->row;
671: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
672: int *diag = a->diag,ii,ic,ir,oidx;
673: MatScalar *aa=a->a,*v;
674: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
675: PetscScalar *x,*b,*t;
678: VecGetArray(bb,&b);
679: VecGetArray(xx,&x);
680: t = a->solve_work;
682: ISGetIndices(isrow,&rout); r = rout;
683: ISGetIndices(iscol,&cout); c = cout;
685: /* copy the b into temp work space according to permutation */
686: ii = 0;
687: for (i=0; i<n; i++) {
688: ic = 4*c[i];
689: t[ii] = b[ic];
690: t[ii+1] = b[ic+1];
691: t[ii+2] = b[ic+2];
692: t[ii+3] = b[ic+3];
693: ii += 4;
694: }
696: /* forward solve the U^T */
697: idx = 0;
698: for (i=0; i<n; i++) {
700: v = aa + 16*diag[i];
701: /* multiply by the inverse of the block diagonal */
702: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
703: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
704: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
705: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
706: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
707: v += 16;
709: vi = aj + diag[i] + 1;
710: nz = ai[i+1] - diag[i] - 1;
711: while (nz--) {
712: oidx = 4*(*vi++);
713: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
714: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
715: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
716: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
717: v += 16;
718: }
719: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
720: idx += 4;
721: }
722: /* backward solve the L^T */
723: for (i=n-1; i>=0; i--){
724: v = aa + 16*diag[i] - 16;
725: vi = aj + diag[i] - 1;
726: nz = diag[i] - ai[i];
727: idt = 4*i;
728: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
729: while (nz--) {
730: idx = 4*(*vi--);
731: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
732: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
733: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
734: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
735: v -= 16;
736: }
737: }
739: /* copy t into x according to permutation */
740: ii = 0;
741: for (i=0; i<n; i++) {
742: ir = 4*r[i];
743: x[ir] = t[ii];
744: x[ir+1] = t[ii+1];
745: x[ir+2] = t[ii+2];
746: x[ir+3] = t[ii+3];
747: ii += 4;
748: }
750: ISRestoreIndices(isrow,&rout);
751: ISRestoreIndices(iscol,&cout);
752: VecRestoreArray(bb,&b);
753: VecRestoreArray(xx,&x);
754: PetscLogFlops(2*16*(a->nz) - 4*A->n);
755: return(0);
756: }
758: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
759: {
760: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
761: IS iscol=a->col,isrow=a->row;
762: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
763: int *diag = a->diag,ii,ic,ir,oidx;
764: MatScalar *aa=a->a,*v;
765: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
766: PetscScalar *x,*b,*t;
769: VecGetArray(bb,&b);
770: VecGetArray(xx,&x);
771: t = a->solve_work;
773: ISGetIndices(isrow,&rout); r = rout;
774: ISGetIndices(iscol,&cout); c = cout;
776: /* copy the b into temp work space according to permutation */
777: ii = 0;
778: for (i=0; i<n; i++) {
779: ic = 5*c[i];
780: t[ii] = b[ic];
781: t[ii+1] = b[ic+1];
782: t[ii+2] = b[ic+2];
783: t[ii+3] = b[ic+3];
784: t[ii+4] = b[ic+4];
785: ii += 5;
786: }
788: /* forward solve the U^T */
789: idx = 0;
790: for (i=0; i<n; i++) {
792: v = aa + 25*diag[i];
793: /* multiply by the inverse of the block diagonal */
794: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
795: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
796: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
797: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
798: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
799: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
800: v += 25;
802: vi = aj + diag[i] + 1;
803: nz = ai[i+1] - diag[i] - 1;
804: while (nz--) {
805: oidx = 5*(*vi++);
806: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
807: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
808: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
809: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
810: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
811: v += 25;
812: }
813: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
814: idx += 5;
815: }
816: /* backward solve the L^T */
817: for (i=n-1; i>=0; i--){
818: v = aa + 25*diag[i] - 25;
819: vi = aj + diag[i] - 1;
820: nz = diag[i] - ai[i];
821: idt = 5*i;
822: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
823: while (nz--) {
824: idx = 5*(*vi--);
825: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
826: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
827: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
828: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
829: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
830: v -= 25;
831: }
832: }
834: /* copy t into x according to permutation */
835: ii = 0;
836: for (i=0; i<n; i++) {
837: ir = 5*r[i];
838: x[ir] = t[ii];
839: x[ir+1] = t[ii+1];
840: x[ir+2] = t[ii+2];
841: x[ir+3] = t[ii+3];
842: x[ir+4] = t[ii+4];
843: ii += 5;
844: }
846: ISRestoreIndices(isrow,&rout);
847: ISRestoreIndices(iscol,&cout);
848: VecRestoreArray(bb,&b);
849: VecRestoreArray(xx,&x);
850: PetscLogFlops(2*25*(a->nz) - 5*A->n);
851: return(0);
852: }
854: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
855: {
856: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
857: IS iscol=a->col,isrow=a->row;
858: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
859: int *diag = a->diag,ii,ic,ir,oidx;
860: MatScalar *aa=a->a,*v;
861: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
862: PetscScalar *x,*b,*t;
865: VecGetArray(bb,&b);
866: VecGetArray(xx,&x);
867: t = a->solve_work;
869: ISGetIndices(isrow,&rout); r = rout;
870: ISGetIndices(iscol,&cout); c = cout;
872: /* copy the b into temp work space according to permutation */
873: ii = 0;
874: for (i=0; i<n; i++) {
875: ic = 6*c[i];
876: t[ii] = b[ic];
877: t[ii+1] = b[ic+1];
878: t[ii+2] = b[ic+2];
879: t[ii+3] = b[ic+3];
880: t[ii+4] = b[ic+4];
881: t[ii+5] = b[ic+5];
882: ii += 6;
883: }
885: /* forward solve the U^T */
886: idx = 0;
887: for (i=0; i<n; i++) {
889: v = aa + 36*diag[i];
890: /* multiply by the inverse of the block diagonal */
891: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
892: x6 = t[5+idx];
893: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
894: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
895: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
896: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
897: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
898: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
899: v += 36;
901: vi = aj + diag[i] + 1;
902: nz = ai[i+1] - diag[i] - 1;
903: while (nz--) {
904: oidx = 6*(*vi++);
905: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
906: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
907: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
908: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
909: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
910: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
911: v += 36;
912: }
913: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
914: t[5+idx] = s6;
915: idx += 6;
916: }
917: /* backward solve the L^T */
918: for (i=n-1; i>=0; i--){
919: v = aa + 36*diag[i] - 36;
920: vi = aj + diag[i] - 1;
921: nz = diag[i] - ai[i];
922: idt = 6*i;
923: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
924: s6 = t[5+idt];
925: while (nz--) {
926: idx = 6*(*vi--);
927: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
928: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
929: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
930: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
931: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
932: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
933: v -= 36;
934: }
935: }
937: /* copy t into x according to permutation */
938: ii = 0;
939: for (i=0; i<n; i++) {
940: ir = 6*r[i];
941: x[ir] = t[ii];
942: x[ir+1] = t[ii+1];
943: x[ir+2] = t[ii+2];
944: x[ir+3] = t[ii+3];
945: x[ir+4] = t[ii+4];
946: x[ir+5] = t[ii+5];
947: ii += 6;
948: }
950: ISRestoreIndices(isrow,&rout);
951: ISRestoreIndices(iscol,&cout);
952: VecRestoreArray(bb,&b);
953: VecRestoreArray(xx,&x);
954: PetscLogFlops(2*36*(a->nz) - 6*A->n);
955: return(0);
956: }
958: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
959: {
960: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
961: IS iscol=a->col,isrow=a->row;
962: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
963: int *diag = a->diag,ii,ic,ir,oidx;
964: MatScalar *aa=a->a,*v;
965: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
966: PetscScalar *x,*b,*t;
969: VecGetArray(bb,&b);
970: VecGetArray(xx,&x);
971: t = a->solve_work;
973: ISGetIndices(isrow,&rout); r = rout;
974: ISGetIndices(iscol,&cout); c = cout;
976: /* copy the b into temp work space according to permutation */
977: ii = 0;
978: for (i=0; i<n; i++) {
979: ic = 7*c[i];
980: t[ii] = b[ic];
981: t[ii+1] = b[ic+1];
982: t[ii+2] = b[ic+2];
983: t[ii+3] = b[ic+3];
984: t[ii+4] = b[ic+4];
985: t[ii+5] = b[ic+5];
986: t[ii+6] = b[ic+6];
987: ii += 7;
988: }
990: /* forward solve the U^T */
991: idx = 0;
992: for (i=0; i<n; i++) {
994: v = aa + 49*diag[i];
995: /* multiply by the inverse of the block diagonal */
996: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
997: x6 = t[5+idx]; x7 = t[6+idx];
998: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
999: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1000: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1001: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1002: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1003: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1004: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1005: v += 49;
1007: vi = aj + diag[i] + 1;
1008: nz = ai[i+1] - diag[i] - 1;
1009: while (nz--) {
1010: oidx = 7*(*vi++);
1011: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1012: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1013: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1014: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1015: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1016: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1017: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1018: v += 49;
1019: }
1020: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1021: t[5+idx] = s6;t[6+idx] = s7;
1022: idx += 7;
1023: }
1024: /* backward solve the L^T */
1025: for (i=n-1; i>=0; i--){
1026: v = aa + 49*diag[i] - 49;
1027: vi = aj + diag[i] - 1;
1028: nz = diag[i] - ai[i];
1029: idt = 7*i;
1030: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1031: s6 = t[5+idt];s7 = t[6+idt];
1032: while (nz--) {
1033: idx = 7*(*vi--);
1034: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1035: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1036: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1037: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1038: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1039: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1040: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1041: v -= 49;
1042: }
1043: }
1045: /* copy t into x according to permutation */
1046: ii = 0;
1047: for (i=0; i<n; i++) {
1048: ir = 7*r[i];
1049: x[ir] = t[ii];
1050: x[ir+1] = t[ii+1];
1051: x[ir+2] = t[ii+2];
1052: x[ir+3] = t[ii+3];
1053: x[ir+4] = t[ii+4];
1054: x[ir+5] = t[ii+5];
1055: x[ir+6] = t[ii+6];
1056: ii += 7;
1057: }
1059: ISRestoreIndices(isrow,&rout);
1060: ISRestoreIndices(iscol,&cout);
1061: VecRestoreArray(bb,&b);
1062: VecRestoreArray(xx,&x);
1063: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1064: return(0);
1065: }
1067: /* ----------------------------------------------------------- */
1068: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1069: {
1070: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1071: IS iscol=a->col,isrow=a->row;
1072: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1073: int nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1074: MatScalar *aa=a->a,*v;
1075: PetscScalar *x,*b,*s,*t,*ls;
1078: VecGetArray(bb,&b);
1079: VecGetArray(xx,&x);
1080: t = a->solve_work;
1082: ISGetIndices(isrow,&rout); r = rout;
1083: ISGetIndices(iscol,&cout); c = cout + (n-1);
1085: /* forward solve the lower triangular */
1086: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1087: for (i=1; i<n; i++) {
1088: v = aa + bs2*ai[i];
1089: vi = aj + ai[i];
1090: nz = a->diag[i] - ai[i];
1091: s = t + bs*i;
1092: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1093: while (nz--) {
1094: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1095: v += bs2;
1096: }
1097: }
1098: /* backward solve the upper triangular */
1099: ls = a->solve_work + A->n;
1100: for (i=n-1; i>=0; i--){
1101: v = aa + bs2*(a->diag[i] + 1);
1102: vi = aj + a->diag[i] + 1;
1103: nz = ai[i+1] - a->diag[i] - 1;
1104: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1105: while (nz--) {
1106: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1107: v += bs2;
1108: }
1109: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1110: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1111: }
1113: ISRestoreIndices(isrow,&rout);
1114: ISRestoreIndices(iscol,&cout);
1115: VecRestoreArray(bb,&b);
1116: VecRestoreArray(xx,&x);
1117: PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1118: return(0);
1119: }
1121: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1122: {
1123: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1124: IS iscol=a->col,isrow=a->row;
1125: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1126: int *diag = a->diag;
1127: MatScalar *aa=a->a,*v;
1128: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1129: PetscScalar *x,*b,*t;
1132: VecGetArray(bb,&b);
1133: VecGetArray(xx,&x);
1134: t = a->solve_work;
1136: ISGetIndices(isrow,&rout); r = rout;
1137: ISGetIndices(iscol,&cout); c = cout + (n-1);
1139: /* forward solve the lower triangular */
1140: idx = 7*(*r++);
1141: t[0] = b[idx]; t[1] = b[1+idx];
1142: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1143: t[5] = b[5+idx]; t[6] = b[6+idx];
1145: for (i=1; i<n; i++) {
1146: v = aa + 49*ai[i];
1147: vi = aj + ai[i];
1148: nz = diag[i] - ai[i];
1149: idx = 7*(*r++);
1150: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1151: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1152: while (nz--) {
1153: idx = 7*(*vi++);
1154: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1155: x4 = t[3+idx];x5 = t[4+idx];
1156: x6 = t[5+idx];x7 = t[6+idx];
1157: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1158: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1159: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1160: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1161: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1162: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1163: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1164: v += 49;
1165: }
1166: idx = 7*i;
1167: t[idx] = s1;t[1+idx] = s2;
1168: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1169: t[5+idx] = s6;t[6+idx] = s7;
1170: }
1171: /* backward solve the upper triangular */
1172: for (i=n-1; i>=0; i--){
1173: v = aa + 49*diag[i] + 49;
1174: vi = aj + diag[i] + 1;
1175: nz = ai[i+1] - diag[i] - 1;
1176: idt = 7*i;
1177: s1 = t[idt]; s2 = t[1+idt];
1178: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1179: s6 = t[5+idt];s7 = t[6+idt];
1180: while (nz--) {
1181: idx = 7*(*vi++);
1182: x1 = t[idx]; x2 = t[1+idx];
1183: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1184: x6 = t[5+idx]; x7 = t[6+idx];
1185: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1186: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1187: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1188: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1189: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1190: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1191: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1192: v += 49;
1193: }
1194: idc = 7*(*c--);
1195: v = aa + 49*diag[i];
1196: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1197: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1198: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1199: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1200: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1201: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1202: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1203: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1204: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1205: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1206: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1207: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1208: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1209: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1210: }
1212: ISRestoreIndices(isrow,&rout);
1213: ISRestoreIndices(iscol,&cout);
1214: VecRestoreArray(bb,&b);
1215: VecRestoreArray(xx,&x);
1216: PetscLogFlops(2*49*(a->nz) - 7*A->n);
1217: return(0);
1218: }
1220: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1221: {
1222: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1223: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1224: int ierr,*diag = a->diag,jdx;
1225: MatScalar *aa=a->a,*v;
1226: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1229: VecGetArray(bb,&b);
1230: VecGetArray(xx,&x);
1231: /* forward solve the lower triangular */
1232: idx = 0;
1233: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1234: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1235: x[6] = b[6+idx];
1236: for (i=1; i<n; i++) {
1237: v = aa + 49*ai[i];
1238: vi = aj + ai[i];
1239: nz = diag[i] - ai[i];
1240: idx = 7*i;
1241: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1242: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1243: s7 = b[6+idx];
1244: while (nz--) {
1245: jdx = 7*(*vi++);
1246: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1247: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1248: x7 = x[6+jdx];
1249: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1250: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1251: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1252: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1253: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1254: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1255: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1256: v += 49;
1257: }
1258: x[idx] = s1;
1259: x[1+idx] = s2;
1260: x[2+idx] = s3;
1261: x[3+idx] = s4;
1262: x[4+idx] = s5;
1263: x[5+idx] = s6;
1264: x[6+idx] = s7;
1265: }
1266: /* backward solve the upper triangular */
1267: for (i=n-1; i>=0; i--){
1268: v = aa + 49*diag[i] + 49;
1269: vi = aj + diag[i] + 1;
1270: nz = ai[i+1] - diag[i] - 1;
1271: idt = 7*i;
1272: s1 = x[idt]; s2 = x[1+idt];
1273: s3 = x[2+idt]; s4 = x[3+idt];
1274: s5 = x[4+idt]; s6 = x[5+idt];
1275: s7 = x[6+idt];
1276: while (nz--) {
1277: idx = 7*(*vi++);
1278: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1279: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1280: x7 = x[6+idx];
1281: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1282: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1283: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1284: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1285: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1286: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1287: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1288: v += 49;
1289: }
1290: v = aa + 49*diag[i];
1291: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1292: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1293: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1294: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1295: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1296: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1297: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1298: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1299: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1300: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1301: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1302: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1303: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1304: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1305: }
1307: VecRestoreArray(bb,&b);
1308: VecRestoreArray(xx,&x);
1309: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1310: return(0);
1311: }
1313: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1314: {
1315: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1316: IS iscol=a->col,isrow=a->row;
1317: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1318: int *diag = a->diag;
1319: MatScalar *aa=a->a,*v;
1320: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1323: VecGetArray(bb,&b);
1324: VecGetArray(xx,&x);
1325: t = a->solve_work;
1327: ISGetIndices(isrow,&rout); r = rout;
1328: ISGetIndices(iscol,&cout); c = cout + (n-1);
1330: /* forward solve the lower triangular */
1331: idx = 6*(*r++);
1332: t[0] = b[idx]; t[1] = b[1+idx];
1333: t[2] = b[2+idx]; t[3] = b[3+idx];
1334: t[4] = b[4+idx]; t[5] = b[5+idx];
1335: for (i=1; i<n; i++) {
1336: v = aa + 36*ai[i];
1337: vi = aj + ai[i];
1338: nz = diag[i] - ai[i];
1339: idx = 6*(*r++);
1340: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1341: s5 = b[4+idx]; s6 = b[5+idx];
1342: while (nz--) {
1343: idx = 6*(*vi++);
1344: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1345: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1346: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1347: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1348: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1349: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1350: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1351: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1352: v += 36;
1353: }
1354: idx = 6*i;
1355: t[idx] = s1;t[1+idx] = s2;
1356: t[2+idx] = s3;t[3+idx] = s4;
1357: t[4+idx] = s5;t[5+idx] = s6;
1358: }
1359: /* backward solve the upper triangular */
1360: for (i=n-1; i>=0; i--){
1361: v = aa + 36*diag[i] + 36;
1362: vi = aj + diag[i] + 1;
1363: nz = ai[i+1] - diag[i] - 1;
1364: idt = 6*i;
1365: s1 = t[idt]; s2 = t[1+idt];
1366: s3 = t[2+idt];s4 = t[3+idt];
1367: s5 = t[4+idt];s6 = t[5+idt];
1368: while (nz--) {
1369: idx = 6*(*vi++);
1370: x1 = t[idx]; x2 = t[1+idx];
1371: x3 = t[2+idx]; x4 = t[3+idx];
1372: x5 = t[4+idx]; x6 = t[5+idx];
1373: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1374: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1375: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1376: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1377: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1378: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1379: v += 36;
1380: }
1381: idc = 6*(*c--);
1382: v = aa + 36*diag[i];
1383: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1384: v[18]*s4+v[24]*s5+v[30]*s6;
1385: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1386: v[19]*s4+v[25]*s5+v[31]*s6;
1387: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1388: v[20]*s4+v[26]*s5+v[32]*s6;
1389: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1390: v[21]*s4+v[27]*s5+v[33]*s6;
1391: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1392: v[22]*s4+v[28]*s5+v[34]*s6;
1393: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1394: v[23]*s4+v[29]*s5+v[35]*s6;
1395: }
1397: ISRestoreIndices(isrow,&rout);
1398: ISRestoreIndices(iscol,&cout);
1399: VecRestoreArray(bb,&b);
1400: VecRestoreArray(xx,&x);
1401: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1402: return(0);
1403: }
1405: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1406: {
1407: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1408: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1409: int ierr,*diag = a->diag,jdx;
1410: MatScalar *aa=a->a,*v;
1411: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1414: VecGetArray(bb,&b);
1415: VecGetArray(xx,&x);
1416: /* forward solve the lower triangular */
1417: idx = 0;
1418: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1419: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1420: for (i=1; i<n; i++) {
1421: v = aa + 36*ai[i];
1422: vi = aj + ai[i];
1423: nz = diag[i] - ai[i];
1424: idx = 6*i;
1425: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1426: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1427: while (nz--) {
1428: jdx = 6*(*vi++);
1429: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1430: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1431: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1432: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1433: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1434: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1435: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1436: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1437: v += 36;
1438: }
1439: x[idx] = s1;
1440: x[1+idx] = s2;
1441: x[2+idx] = s3;
1442: x[3+idx] = s4;
1443: x[4+idx] = s5;
1444: x[5+idx] = s6;
1445: }
1446: /* backward solve the upper triangular */
1447: for (i=n-1; i>=0; i--){
1448: v = aa + 36*diag[i] + 36;
1449: vi = aj + diag[i] + 1;
1450: nz = ai[i+1] - diag[i] - 1;
1451: idt = 6*i;
1452: s1 = x[idt]; s2 = x[1+idt];
1453: s3 = x[2+idt]; s4 = x[3+idt];
1454: s5 = x[4+idt]; s6 = x[5+idt];
1455: while (nz--) {
1456: idx = 6*(*vi++);
1457: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1458: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1459: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1460: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1461: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1462: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1463: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1464: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1465: v += 36;
1466: }
1467: v = aa + 36*diag[i];
1468: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1469: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1470: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1471: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1472: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1473: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1474: }
1476: VecRestoreArray(bb,&b);
1477: VecRestoreArray(xx,&x);
1478: PetscLogFlops(2*36*(a->nz) - 6*A->n);
1479: return(0);
1480: }
1482: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1483: {
1484: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1485: IS iscol=a->col,isrow=a->row;
1486: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1487: int *diag = a->diag;
1488: MatScalar *aa=a->a,*v;
1489: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1492: VecGetArray(bb,&b);
1493: VecGetArray(xx,&x);
1494: t = a->solve_work;
1496: ISGetIndices(isrow,&rout); r = rout;
1497: ISGetIndices(iscol,&cout); c = cout + (n-1);
1499: /* forward solve the lower triangular */
1500: idx = 5*(*r++);
1501: t[0] = b[idx]; t[1] = b[1+idx];
1502: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1503: for (i=1; i<n; i++) {
1504: v = aa + 25*ai[i];
1505: vi = aj + ai[i];
1506: nz = diag[i] - ai[i];
1507: idx = 5*(*r++);
1508: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1509: s5 = b[4+idx];
1510: while (nz--) {
1511: idx = 5*(*vi++);
1512: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1513: x4 = t[3+idx];x5 = t[4+idx];
1514: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1515: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1516: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1517: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1518: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1519: v += 25;
1520: }
1521: idx = 5*i;
1522: t[idx] = s1;t[1+idx] = s2;
1523: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1524: }
1525: /* backward solve the upper triangular */
1526: for (i=n-1; i>=0; i--){
1527: v = aa + 25*diag[i] + 25;
1528: vi = aj + diag[i] + 1;
1529: nz = ai[i+1] - diag[i] - 1;
1530: idt = 5*i;
1531: s1 = t[idt]; s2 = t[1+idt];
1532: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1533: while (nz--) {
1534: idx = 5*(*vi++);
1535: x1 = t[idx]; x2 = t[1+idx];
1536: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1537: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1538: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1539: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1540: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1541: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1542: v += 25;
1543: }
1544: idc = 5*(*c--);
1545: v = aa + 25*diag[i];
1546: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1547: v[15]*s4+v[20]*s5;
1548: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1549: v[16]*s4+v[21]*s5;
1550: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1551: v[17]*s4+v[22]*s5;
1552: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1553: v[18]*s4+v[23]*s5;
1554: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1555: v[19]*s4+v[24]*s5;
1556: }
1558: ISRestoreIndices(isrow,&rout);
1559: ISRestoreIndices(iscol,&cout);
1560: VecRestoreArray(bb,&b);
1561: VecRestoreArray(xx,&x);
1562: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1563: return(0);
1564: }
1566: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1567: {
1568: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1569: int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1570: int ierr,*diag = a->diag,jdx;
1571: MatScalar *aa=a->a,*v;
1572: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1575: VecGetArray(bb,&b);
1576: VecGetArray(xx,&x);
1577: /* forward solve the lower triangular */
1578: idx = 0;
1579: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1580: for (i=1; i<n; i++) {
1581: v = aa + 25*ai[i];
1582: vi = aj + ai[i];
1583: nz = diag[i] - ai[i];
1584: idx = 5*i;
1585: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1586: while (nz--) {
1587: jdx = 5*(*vi++);
1588: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1589: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1590: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1591: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1592: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1593: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1594: v += 25;
1595: }
1596: x[idx] = s1;
1597: x[1+idx] = s2;
1598: x[2+idx] = s3;
1599: x[3+idx] = s4;
1600: x[4+idx] = s5;
1601: }
1602: /* backward solve the upper triangular */
1603: for (i=n-1; i>=0; i--){
1604: v = aa + 25*diag[i] + 25;
1605: vi = aj + diag[i] + 1;
1606: nz = ai[i+1] - diag[i] - 1;
1607: idt = 5*i;
1608: s1 = x[idt]; s2 = x[1+idt];
1609: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1610: while (nz--) {
1611: idx = 5*(*vi++);
1612: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1613: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1614: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1615: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1616: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1617: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1618: v += 25;
1619: }
1620: v = aa + 25*diag[i];
1621: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1622: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1623: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1624: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1625: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1626: }
1628: VecRestoreArray(bb,&b);
1629: VecRestoreArray(xx,&x);
1630: PetscLogFlops(2*25*(a->nz) - 5*A->n);
1631: return(0);
1632: }
1634: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1635: {
1636: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1637: IS iscol=a->col,isrow=a->row;
1638: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1639: int *diag = a->diag;
1640: MatScalar *aa=a->a,*v;
1641: PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1644: VecGetArray(bb,&b);
1645: VecGetArray(xx,&x);
1646: t = a->solve_work;
1648: ISGetIndices(isrow,&rout); r = rout;
1649: ISGetIndices(iscol,&cout); c = cout + (n-1);
1651: /* forward solve the lower triangular */
1652: idx = 4*(*r++);
1653: t[0] = b[idx]; t[1] = b[1+idx];
1654: t[2] = b[2+idx]; t[3] = b[3+idx];
1655: for (i=1; i<n; i++) {
1656: v = aa + 16*ai[i];
1657: vi = aj + ai[i];
1658: nz = diag[i] - ai[i];
1659: idx = 4*(*r++);
1660: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1661: while (nz--) {
1662: idx = 4*(*vi++);
1663: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1664: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1665: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1666: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1667: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1668: v += 16;
1669: }
1670: idx = 4*i;
1671: t[idx] = s1;t[1+idx] = s2;
1672: t[2+idx] = s3;t[3+idx] = s4;
1673: }
1674: /* backward solve the upper triangular */
1675: for (i=n-1; i>=0; i--){
1676: v = aa + 16*diag[i] + 16;
1677: vi = aj + diag[i] + 1;
1678: nz = ai[i+1] - diag[i] - 1;
1679: idt = 4*i;
1680: s1 = t[idt]; s2 = t[1+idt];
1681: s3 = t[2+idt];s4 = t[3+idt];
1682: while (nz--) {
1683: idx = 4*(*vi++);
1684: x1 = t[idx]; x2 = t[1+idx];
1685: x3 = t[2+idx]; x4 = t[3+idx];
1686: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1687: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1688: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1689: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1690: v += 16;
1691: }
1692: idc = 4*(*c--);
1693: v = aa + 16*diag[i];
1694: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1695: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1696: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1697: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1698: }
1700: ISRestoreIndices(isrow,&rout);
1701: ISRestoreIndices(iscol,&cout);
1702: VecRestoreArray(bb,&b);
1703: VecRestoreArray(xx,&x);
1704: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1705: return(0);
1706: }
1707: #if defined (PETSC_HAVE_SSE)
1709: #include PETSC_HAVE_SSE
1711: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1712: {
1713: /*
1714: Note: This code uses demotion of double
1715: to float when performing the mixed-mode computation.
1716: This may not be numerically reasonable for all applications.
1717: */
1718: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1719: IS iscol=a->col,isrow=a->row;
1720: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1721: int *diag = a->diag,ai16;
1722: MatScalar *aa=a->a,*v;
1723: PetscScalar *x,*b,*t;
1725: /* Make space in temp stack for 16 Byte Aligned arrays */
1726: float ssealignedspace[11],*tmps,*tmpx;
1727: unsigned long offset;
1728:
1730: SSE_SCOPE_BEGIN;
1732: offset = (unsigned long)ssealignedspace % 16;
1733: if (offset) offset = (16 - offset)/4;
1734: tmps = &ssealignedspace[offset];
1735: tmpx = &ssealignedspace[offset+4];
1736: PREFETCH_NTA(aa+16*ai[1]);
1738: VecGetArray(bb,&b);
1739: VecGetArray(xx,&x);
1740: t = a->solve_work;
1742: ISGetIndices(isrow,&rout); r = rout;
1743: ISGetIndices(iscol,&cout); c = cout + (n-1);
1745: /* forward solve the lower triangular */
1746: idx = 4*(*r++);
1747: t[0] = b[idx]; t[1] = b[1+idx];
1748: t[2] = b[2+idx]; t[3] = b[3+idx];
1749: v = aa + 16*ai[1];
1751: for (i=1; i<n;) {
1752: PREFETCH_NTA(&v[8]);
1753: vi = aj + ai[i];
1754: nz = diag[i] - ai[i];
1755: idx = 4*(*r++);
1757: /* Demote sum from double to float */
1758: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1759: LOAD_PS(tmps,XMM7);
1761: while (nz--) {
1762: PREFETCH_NTA(&v[16]);
1763: idx = 4*(*vi++);
1764:
1765: /* Demote solution (so far) from double to float */
1766: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1768: /* 4x4 Matrix-Vector product with negative accumulation: */
1769: SSE_INLINE_BEGIN_2(tmpx,v)
1770: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1772: /* First Column */
1773: SSE_COPY_PS(XMM0,XMM6)
1774: SSE_SHUFFLE(XMM0,XMM0,0x00)
1775: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1776: SSE_SUB_PS(XMM7,XMM0)
1777:
1778: /* Second Column */
1779: SSE_COPY_PS(XMM1,XMM6)
1780: SSE_SHUFFLE(XMM1,XMM1,0x55)
1781: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1782: SSE_SUB_PS(XMM7,XMM1)
1783:
1784: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1785:
1786: /* Third Column */
1787: SSE_COPY_PS(XMM2,XMM6)
1788: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1789: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1790: SSE_SUB_PS(XMM7,XMM2)
1792: /* Fourth Column */
1793: SSE_COPY_PS(XMM3,XMM6)
1794: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1795: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1796: SSE_SUB_PS(XMM7,XMM3)
1797: SSE_INLINE_END_2
1798:
1799: v += 16;
1800: }
1801: idx = 4*i;
1802: v = aa + 16*ai[++i];
1803: PREFETCH_NTA(v);
1804: STORE_PS(tmps,XMM7);
1806: /* Promote result from float to double */
1807: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1808: }
1809: /* backward solve the upper triangular */
1810: idt = 4*(n-1);
1811: ai16 = 16*diag[n-1];
1812: v = aa + ai16 + 16;
1813: for (i=n-1; i>=0;){
1814: PREFETCH_NTA(&v[8]);
1815: vi = aj + diag[i] + 1;
1816: nz = ai[i+1] - diag[i] - 1;
1817:
1818: /* Demote accumulator from double to float */
1819: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1820: LOAD_PS(tmps,XMM7);
1822: while (nz--) {
1823: PREFETCH_NTA(&v[16]);
1824: idx = 4*(*vi++);
1826: /* Demote solution (so far) from double to float */
1827: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1829: /* 4x4 Matrix-Vector Product with negative accumulation: */
1830: SSE_INLINE_BEGIN_2(tmpx,v)
1831: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1833: /* First Column */
1834: SSE_COPY_PS(XMM0,XMM6)
1835: SSE_SHUFFLE(XMM0,XMM0,0x00)
1836: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1837: SSE_SUB_PS(XMM7,XMM0)
1839: /* Second Column */
1840: SSE_COPY_PS(XMM1,XMM6)
1841: SSE_SHUFFLE(XMM1,XMM1,0x55)
1842: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1843: SSE_SUB_PS(XMM7,XMM1)
1845: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1846:
1847: /* Third Column */
1848: SSE_COPY_PS(XMM2,XMM6)
1849: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1850: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1851: SSE_SUB_PS(XMM7,XMM2)
1853: /* Fourth Column */
1854: SSE_COPY_PS(XMM3,XMM6)
1855: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1856: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1857: SSE_SUB_PS(XMM7,XMM3)
1858: SSE_INLINE_END_2
1859: v += 16;
1860: }
1861: v = aa + ai16;
1862: ai16 = 16*diag[--i];
1863: PREFETCH_NTA(aa+ai16+16);
1864: /*
1865: Scale the result by the diagonal 4x4 block,
1866: which was inverted as part of the factorization
1867: */
1868: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1869: /* First Column */
1870: SSE_COPY_PS(XMM0,XMM7)
1871: SSE_SHUFFLE(XMM0,XMM0,0x00)
1872: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
1874: /* Second Column */
1875: SSE_COPY_PS(XMM1,XMM7)
1876: SSE_SHUFFLE(XMM1,XMM1,0x55)
1877: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1878: SSE_ADD_PS(XMM0,XMM1)
1880: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1881:
1882: /* Third Column */
1883: SSE_COPY_PS(XMM2,XMM7)
1884: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1885: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1886: SSE_ADD_PS(XMM0,XMM2)
1888: /* Fourth Column */
1889: SSE_COPY_PS(XMM3,XMM7)
1890: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1891: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1892: SSE_ADD_PS(XMM0,XMM3)
1893:
1894: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1895: SSE_INLINE_END_3
1897: /* Promote solution from float to double */
1898: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
1900: /* Apply reordering to t and stream into x. */
1901: /* This way, x doesn't pollute the cache. */
1902: /* Be careful with size: 2 doubles = 4 floats! */
1903: idc = 4*(*c--);
1904: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
1905: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
1906: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1907: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1908: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1909: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1910: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1911: SSE_INLINE_END_2
1912: v = aa + ai16 + 16;
1913: idt -= 4;
1914: }
1916: ISRestoreIndices(isrow,&rout);
1917: ISRestoreIndices(iscol,&cout);
1918: VecRestoreArray(bb,&b);
1919: VecRestoreArray(xx,&x);
1920: PetscLogFlops(2*16*(a->nz) - 4*A->n);
1921: SSE_SCOPE_END;
1922: return(0);
1923: }
1925: #endif
1928: /*
1929: Special case where the matrix was ILU(0) factored in the natural
1930: ordering. This eliminates the need for the column and row permutation.
1931: */
1932: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1933: {
1934: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1935: int n=a->mbs,*ai=a->i,*aj=a->j;
1936: int ierr,*diag = a->diag;
1937: MatScalar *aa=a->a;
1938: PetscScalar *x,*b;
1941: VecGetArray(bb,&b);
1942: VecGetArray(xx,&x);
1944: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1945: {
1946: static PetscScalar w[2000]; /* very BAD need to fix */
1947: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1948: }
1949: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1950: {
1951: static PetscScalar w[2000]; /* very BAD need to fix */
1952: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1953: }
1954: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1955: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1956: #else
1957: {
1958: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
1959: MatScalar *v;
1960: int jdx,idt,idx,nz,*vi,i,ai16;
1962: /* forward solve the lower triangular */
1963: idx = 0;
1964: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1965: for (i=1; i<n; i++) {
1966: v = aa + 16*ai[i];
1967: vi = aj + ai[i];
1968: nz = diag[i] - ai[i];
1969: idx += 4;
1970: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1971: while (nz--) {
1972: jdx = 4*(*vi++);
1973: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1974: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1975: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1976: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1977: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1978: v += 16;
1979: }
1980: x[idx] = s1;
1981: x[1+idx] = s2;
1982: x[2+idx] = s3;
1983: x[3+idx] = s4;
1984: }
1985: /* backward solve the upper triangular */
1986: idt = 4*(n-1);
1987: for (i=n-1; i>=0; i--){
1988: ai16 = 16*diag[i];
1989: v = aa + ai16 + 16;
1990: vi = aj + diag[i] + 1;
1991: nz = ai[i+1] - diag[i] - 1;
1992: s1 = x[idt]; s2 = x[1+idt];
1993: s3 = x[2+idt];s4 = x[3+idt];
1994: while (nz--) {
1995: idx = 4*(*vi++);
1996: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
1997: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1998: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1999: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2000: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2001: v += 16;
2002: }
2003: v = aa + ai16;
2004: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2005: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2006: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2007: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2008: idt -= 4;
2009: }
2010: }
2011: #endif
2013: VecRestoreArray(bb,&b);
2014: VecRestoreArray(xx,&x);
2015: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2016: return(0);
2017: }
2019: #if defined (PETSC_HAVE_SSE)
2021: #include PETSC_HAVE_SSE
2023: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2024: {
2025: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2026: int n=a->mbs,*ai=a->i,*aj=a->j;
2027: int ierr,*diag = a->diag;
2028: MatScalar *aa=a->a;
2029: PetscScalar *x,*b;
2032: SSE_SCOPE_BEGIN;
2033: /*
2034: Note: This code currently uses demotion of double
2035: to float when performing the mixed-mode computation.
2036: This may not be numerically reasonable for all applications.
2037: */
2038: PREFETCH_NTA(aa+16*ai[1]);
2040: VecGetArray(bb,&b);
2041: VecGetArray(xx,&x);
2042: {
2043: MatScalar *v;
2044: int jdx,idt,idx,nz,*vi,i,ai16;
2046: /* Make space in temp stack for 16 Byte Aligned arrays */
2047: float ssealignedspace[11],*tmps,*tmpx;
2048: unsigned long offset = (unsigned long)ssealignedspace % 16;
2049: if (offset) offset = (16 - offset)/4;
2050: tmps = &ssealignedspace[offset];
2051: tmpx = &ssealignedspace[offset+4];
2053: /* forward solve the lower triangular */
2054: idx = 0;
2055: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2056: v = aa + 16*ai[1];
2058: for (i=1; i<n;) {
2059: PREFETCH_NTA(&v[8]);
2060: vi = aj + ai[i];
2061: nz = diag[i] - ai[i];
2062: idx += 4;
2064: /* Demote sum from double to float */
2065: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2066: LOAD_PS(tmps,XMM7);
2068: while (nz--) {
2069: PREFETCH_NTA(&v[16]);
2070: jdx = 4*(*vi++);
2071:
2072: /* Demote solution (so far) from double to float */
2073: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
2075: /* 4x4 Matrix-Vector product with negative accumulation: */
2076: SSE_INLINE_BEGIN_2(tmpx,v)
2077: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2079: /* First Column */
2080: SSE_COPY_PS(XMM0,XMM6)
2081: SSE_SHUFFLE(XMM0,XMM0,0x00)
2082: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2083: SSE_SUB_PS(XMM7,XMM0)
2085: /* Second Column */
2086: SSE_COPY_PS(XMM1,XMM6)
2087: SSE_SHUFFLE(XMM1,XMM1,0x55)
2088: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2089: SSE_SUB_PS(XMM7,XMM1)
2091: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2092:
2093: /* Third Column */
2094: SSE_COPY_PS(XMM2,XMM6)
2095: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2096: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2097: SSE_SUB_PS(XMM7,XMM2)
2099: /* Fourth Column */
2100: SSE_COPY_PS(XMM3,XMM6)
2101: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2102: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2103: SSE_SUB_PS(XMM7,XMM3)
2104: SSE_INLINE_END_2
2105:
2106: v += 16;
2107: }
2108: v = aa + 16*ai[++i];
2109: PREFETCH_NTA(v);
2110: STORE_PS(tmps,XMM7);
2112: /* Promote result from float to double */
2113: CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
2114: }
2115: /* backward solve the upper triangular */
2116: idt = 4*(n-1);
2117: ai16 = 16*diag[n-1];
2118: v = aa + ai16 + 16;
2119: for (i=n-1; i>=0;){
2120: PREFETCH_NTA(&v[8]);
2121: vi = aj + diag[i] + 1;
2122: nz = ai[i+1] - diag[i] - 1;
2123:
2124: /* Demote accumulator from double to float */
2125: CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
2126: LOAD_PS(tmps,XMM7);
2128: while (nz--) {
2129: PREFETCH_NTA(&v[16]);
2130: idx = 4*(*vi++);
2132: /* Demote solution (so far) from double to float */
2133: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2135: /* 4x4 Matrix-Vector Product with negative accumulation: */
2136: SSE_INLINE_BEGIN_2(tmpx,v)
2137: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2139: /* First Column */
2140: SSE_COPY_PS(XMM0,XMM6)
2141: SSE_SHUFFLE(XMM0,XMM0,0x00)
2142: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2143: SSE_SUB_PS(XMM7,XMM0)
2145: /* Second Column */
2146: SSE_COPY_PS(XMM1,XMM6)
2147: SSE_SHUFFLE(XMM1,XMM1,0x55)
2148: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2149: SSE_SUB_PS(XMM7,XMM1)
2151: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2152:
2153: /* Third Column */
2154: SSE_COPY_PS(XMM2,XMM6)
2155: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2156: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2157: SSE_SUB_PS(XMM7,XMM2)
2159: /* Fourth Column */
2160: SSE_COPY_PS(XMM3,XMM6)
2161: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2162: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2163: SSE_SUB_PS(XMM7,XMM3)
2164: SSE_INLINE_END_2
2165: v += 16;
2166: }
2167: v = aa + ai16;
2168: ai16 = 16*diag[--i];
2169: PREFETCH_NTA(aa+ai16+16);
2170: /*
2171: Scale the result by the diagonal 4x4 block,
2172: which was inverted as part of the factorization
2173: */
2174: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2175: /* First Column */
2176: SSE_COPY_PS(XMM0,XMM7)
2177: SSE_SHUFFLE(XMM0,XMM0,0x00)
2178: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2180: /* Second Column */
2181: SSE_COPY_PS(XMM1,XMM7)
2182: SSE_SHUFFLE(XMM1,XMM1,0x55)
2183: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2184: SSE_ADD_PS(XMM0,XMM1)
2186: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2187:
2188: /* Third Column */
2189: SSE_COPY_PS(XMM2,XMM7)
2190: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2191: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2192: SSE_ADD_PS(XMM0,XMM2)
2194: /* Fourth Column */
2195: SSE_COPY_PS(XMM3,XMM7)
2196: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2197: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2198: SSE_ADD_PS(XMM0,XMM3)
2200: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2201: SSE_INLINE_END_3
2203: /* Promote solution from float to double */
2204: CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
2206: v = aa + ai16 + 16;
2207: idt -= 4;
2208: }
2209: }
2210: VecRestoreArray(bb,&b);
2211: VecRestoreArray(xx,&x);
2212: PetscLogFlops(2*16*(a->nz) - 4*A->n);
2213: SSE_SCOPE_END;
2214: return(0);
2215: }
2217: #endif
2218: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2219: {
2220: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2221: IS iscol=a->col,isrow=a->row;
2222: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2223: int *diag = a->diag;
2224: MatScalar *aa=a->a,*v;
2225: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t;
2228: VecGetArray(bb,&b);
2229: VecGetArray(xx,&x);
2230: t = a->solve_work;
2232: ISGetIndices(isrow,&rout); r = rout;
2233: ISGetIndices(iscol,&cout); c = cout + (n-1);
2235: /* forward solve the lower triangular */
2236: idx = 3*(*r++);
2237: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2238: for (i=1; i<n; i++) {
2239: v = aa + 9*ai[i];
2240: vi = aj + ai[i];
2241: nz = diag[i] - ai[i];
2242: idx = 3*(*r++);
2243: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2244: while (nz--) {
2245: idx = 3*(*vi++);
2246: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2247: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2248: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2249: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2250: v += 9;
2251: }
2252: idx = 3*i;
2253: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2254: }
2255: /* backward solve the upper triangular */
2256: for (i=n-1; i>=0; i--){
2257: v = aa + 9*diag[i] + 9;
2258: vi = aj + diag[i] + 1;
2259: nz = ai[i+1] - diag[i] - 1;
2260: idt = 3*i;
2261: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2262: while (nz--) {
2263: idx = 3*(*vi++);
2264: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2265: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2266: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2267: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2268: v += 9;
2269: }
2270: idc = 3*(*c--);
2271: v = aa + 9*diag[i];
2272: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2273: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2274: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2275: }
2276: ISRestoreIndices(isrow,&rout);
2277: ISRestoreIndices(iscol,&cout);
2278: VecRestoreArray(bb,&b);
2279: VecRestoreArray(xx,&x);
2280: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2281: return(0);
2282: }
2284: /*
2285: Special case where the matrix was ILU(0) factored in the natural
2286: ordering. This eliminates the need for the column and row permutation.
2287: */
2288: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2289: {
2290: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2291: int n=a->mbs,*ai=a->i,*aj=a->j;
2292: int ierr,*diag = a->diag;
2293: MatScalar *aa=a->a,*v;
2294: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3;
2295: int jdx,idt,idx,nz,*vi,i;
2298: VecGetArray(bb,&b);
2299: VecGetArray(xx,&x);
2302: /* forward solve the lower triangular */
2303: idx = 0;
2304: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2305: for (i=1; i<n; i++) {
2306: v = aa + 9*ai[i];
2307: vi = aj + ai[i];
2308: nz = diag[i] - ai[i];
2309: idx += 3;
2310: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2311: while (nz--) {
2312: jdx = 3*(*vi++);
2313: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2314: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2315: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2316: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2317: v += 9;
2318: }
2319: x[idx] = s1;
2320: x[1+idx] = s2;
2321: x[2+idx] = s3;
2322: }
2323: /* backward solve the upper triangular */
2324: for (i=n-1; i>=0; i--){
2325: v = aa + 9*diag[i] + 9;
2326: vi = aj + diag[i] + 1;
2327: nz = ai[i+1] - diag[i] - 1;
2328: idt = 3*i;
2329: s1 = x[idt]; s2 = x[1+idt];
2330: s3 = x[2+idt];
2331: while (nz--) {
2332: idx = 3*(*vi++);
2333: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2334: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2335: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2336: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2337: v += 9;
2338: }
2339: v = aa + 9*diag[i];
2340: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2341: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2342: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2343: }
2345: VecRestoreArray(bb,&b);
2346: VecRestoreArray(xx,&x);
2347: PetscLogFlops(2*9*(a->nz) - 3*A->n);
2348: return(0);
2349: }
2351: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2352: {
2353: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2354: IS iscol=a->col,isrow=a->row;
2355: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2356: int *diag = a->diag;
2357: MatScalar *aa=a->a,*v;
2358: PetscScalar *x,*b,s1,s2,x1,x2,*t;
2361: VecGetArray(bb,&b);
2362: VecGetArray(xx,&x);
2363: t = a->solve_work;
2365: ISGetIndices(isrow,&rout); r = rout;
2366: ISGetIndices(iscol,&cout); c = cout + (n-1);
2368: /* forward solve the lower triangular */
2369: idx = 2*(*r++);
2370: t[0] = b[idx]; t[1] = b[1+idx];
2371: for (i=1; i<n; i++) {
2372: v = aa + 4*ai[i];
2373: vi = aj + ai[i];
2374: nz = diag[i] - ai[i];
2375: idx = 2*(*r++);
2376: s1 = b[idx]; s2 = b[1+idx];
2377: while (nz--) {
2378: idx = 2*(*vi++);
2379: x1 = t[idx]; x2 = t[1+idx];
2380: s1 -= v[0]*x1 + v[2]*x2;
2381: s2 -= v[1]*x1 + v[3]*x2;
2382: v += 4;
2383: }
2384: idx = 2*i;
2385: t[idx] = s1; t[1+idx] = s2;
2386: }
2387: /* backward solve the upper triangular */
2388: for (i=n-1; i>=0; i--){
2389: v = aa + 4*diag[i] + 4;
2390: vi = aj + diag[i] + 1;
2391: nz = ai[i+1] - diag[i] - 1;
2392: idt = 2*i;
2393: s1 = t[idt]; s2 = t[1+idt];
2394: while (nz--) {
2395: idx = 2*(*vi++);
2396: x1 = t[idx]; x2 = t[1+idx];
2397: s1 -= v[0]*x1 + v[2]*x2;
2398: s2 -= v[1]*x1 + v[3]*x2;
2399: v += 4;
2400: }
2401: idc = 2*(*c--);
2402: v = aa + 4*diag[i];
2403: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2404: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2405: }
2406: ISRestoreIndices(isrow,&rout);
2407: ISRestoreIndices(iscol,&cout);
2408: VecRestoreArray(bb,&b);
2409: VecRestoreArray(xx,&x);
2410: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2411: return(0);
2412: }
2414: /*
2415: Special case where the matrix was ILU(0) factored in the natural
2416: ordering. This eliminates the need for the column and row permutation.
2417: */
2418: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2419: {
2420: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2421: int n=a->mbs,*ai=a->i,*aj=a->j;
2422: int ierr,*diag = a->diag;
2423: MatScalar *aa=a->a,*v;
2424: PetscScalar *x,*b,s1,s2,x1,x2;
2425: int jdx,idt,idx,nz,*vi,i;
2428: VecGetArray(bb,&b);
2429: VecGetArray(xx,&x);
2431: /* forward solve the lower triangular */
2432: idx = 0;
2433: x[0] = b[0]; x[1] = b[1];
2434: for (i=1; i<n; i++) {
2435: v = aa + 4*ai[i];
2436: vi = aj + ai[i];
2437: nz = diag[i] - ai[i];
2438: idx += 2;
2439: s1 = b[idx];s2 = b[1+idx];
2440: while (nz--) {
2441: jdx = 2*(*vi++);
2442: x1 = x[jdx];x2 = x[1+jdx];
2443: s1 -= v[0]*x1 + v[2]*x2;
2444: s2 -= v[1]*x1 + v[3]*x2;
2445: v += 4;
2446: }
2447: x[idx] = s1;
2448: x[1+idx] = s2;
2449: }
2450: /* backward solve the upper triangular */
2451: for (i=n-1; i>=0; i--){
2452: v = aa + 4*diag[i] + 4;
2453: vi = aj + diag[i] + 1;
2454: nz = ai[i+1] - diag[i] - 1;
2455: idt = 2*i;
2456: s1 = x[idt]; s2 = x[1+idt];
2457: while (nz--) {
2458: idx = 2*(*vi++);
2459: x1 = x[idx]; x2 = x[1+idx];
2460: s1 -= v[0]*x1 + v[2]*x2;
2461: s2 -= v[1]*x1 + v[3]*x2;
2462: v += 4;
2463: }
2464: v = aa + 4*diag[i];
2465: x[idt] = v[0]*s1 + v[2]*s2;
2466: x[1+idt] = v[1]*s1 + v[3]*s2;
2467: }
2469: VecRestoreArray(bb,&b);
2470: VecRestoreArray(xx,&x);
2471: PetscLogFlops(2*4*(a->nz) - 2*A->n);
2472: return(0);
2473: }
2475: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2476: {
2477: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2478: IS iscol=a->col,isrow=a->row;
2479: int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2480: int *diag = a->diag;
2481: MatScalar *aa=a->a,*v;
2482: PetscScalar *x,*b,s1,*t;
2485: if (!n) return(0);
2487: VecGetArray(bb,&b);
2488: VecGetArray(xx,&x);
2489: t = a->solve_work;
2491: ISGetIndices(isrow,&rout); r = rout;
2492: ISGetIndices(iscol,&cout); c = cout + (n-1);
2494: /* forward solve the lower triangular */
2495: t[0] = b[*r++];
2496: for (i=1; i<n; i++) {
2497: v = aa + ai[i];
2498: vi = aj + ai[i];
2499: nz = diag[i] - ai[i];
2500: s1 = b[*r++];
2501: while (nz--) {
2502: s1 -= (*v++)*t[*vi++];
2503: }
2504: t[i] = s1;
2505: }
2506: /* backward solve the upper triangular */
2507: for (i=n-1; i>=0; i--){
2508: v = aa + diag[i] + 1;
2509: vi = aj + diag[i] + 1;
2510: nz = ai[i+1] - diag[i] - 1;
2511: s1 = t[i];
2512: while (nz--) {
2513: s1 -= (*v++)*t[*vi++];
2514: }
2515: x[*c--] = t[i] = aa[diag[i]]*s1;
2516: }
2518: ISRestoreIndices(isrow,&rout);
2519: ISRestoreIndices(iscol,&cout);
2520: VecRestoreArray(bb,&b);
2521: VecRestoreArray(xx,&x);
2522: PetscLogFlops(2*1*(a->nz) - A->n);
2523: return(0);
2524: }
2525: /*
2526: Special case where the matrix was ILU(0) factored in the natural
2527: ordering. This eliminates the need for the column and row permutation.
2528: */
2529: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2530: {
2531: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2532: int n=a->mbs,*ai=a->i,*aj=a->j;
2533: int ierr,*diag = a->diag;
2534: MatScalar *aa=a->a;
2535: PetscScalar *x,*b;
2536: PetscScalar s1,x1;
2537: MatScalar *v;
2538: int jdx,idt,idx,nz,*vi,i;
2541: VecGetArray(bb,&b);
2542: VecGetArray(xx,&x);
2544: /* forward solve the lower triangular */
2545: idx = 0;
2546: x[0] = b[0];
2547: for (i=1; i<n; i++) {
2548: v = aa + ai[i];
2549: vi = aj + ai[i];
2550: nz = diag[i] - ai[i];
2551: idx += 1;
2552: s1 = b[idx];
2553: while (nz--) {
2554: jdx = *vi++;
2555: x1 = x[jdx];
2556: s1 -= v[0]*x1;
2557: v += 1;
2558: }
2559: x[idx] = s1;
2560: }
2561: /* backward solve the upper triangular */
2562: for (i=n-1; i>=0; i--){
2563: v = aa + diag[i] + 1;
2564: vi = aj + diag[i] + 1;
2565: nz = ai[i+1] - diag[i] - 1;
2566: idt = i;
2567: s1 = x[idt];
2568: while (nz--) {
2569: idx = *vi++;
2570: x1 = x[idx];
2571: s1 -= v[0]*x1;
2572: v += 1;
2573: }
2574: v = aa + diag[i];
2575: x[idt] = v[0]*s1;
2576: }
2577: VecRestoreArray(bb,&b);
2578: VecRestoreArray(xx,&x);
2579: PetscLogFlops(2*(a->nz) - A->n);
2580: return(0);
2581: }
2583: /* ----------------------------------------------------------------*/
2584: /*
2585: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2586: except that the data structure of Mat_SeqAIJ is slightly different.
2587: Not a good example of code reuse.
2588: */
2589: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2591: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2592: {
2593: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2594: IS isicol;
2595: int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2596: int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2597: int *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2598: int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2599: PetscTruth col_identity,row_identity;
2600: PetscReal f;
2603: if (info) {
2604: f = info->fill;
2605: levels = (int)info->levels;
2606: diagonal_fill = (int)info->diagonal_fill;
2607: } else {
2608: f = 1.0;
2609: levels = 0;
2610: diagonal_fill = 0;
2611: }
2612: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
2613: ISIdentity(isrow,&row_identity);
2614: ISIdentity(iscol,&col_identity);
2616: if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */
2617: MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
2618: (*fact)->factor = FACTOR_LU;
2619: b = (Mat_SeqBAIJ*)(*fact)->data;
2620: if (!b->diag) {
2621: MatMarkDiagonal_SeqBAIJ(*fact);
2622: }
2623: MatMissingDiagonal_SeqBAIJ(*fact);
2624: b->row = isrow;
2625: b->col = iscol;
2626: ierr = PetscObjectReference((PetscObject)isrow);
2627: ierr = PetscObjectReference((PetscObject)iscol);
2628: b->icol = isicol;
2629: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
2630: ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
2631: } else { /* general case perform the symbolic factorization */
2632: ISGetIndices(isrow,&r);
2633: ISGetIndices(isicol,&ic);
2635: /* get new row pointers */
2636: PetscMalloc((n+1)*sizeof(int),&ainew);
2637: ainew[0] = 0;
2638: /* don't know how many column pointers are needed so estimate */
2639: jmax = (int)(f*ai[n] + 1);
2640: PetscMalloc((jmax)*sizeof(int),&ajnew);
2641: /* ajfill is level of fill for each fill entry */
2642: PetscMalloc((jmax)*sizeof(int),&ajfill);
2643: /* fill is a linked list of nonzeros in active row */
2644: PetscMalloc((n+1)*sizeof(int),&fill);
2645: /* im is level for each filled value */
2646: PetscMalloc((n+1)*sizeof(int),&im);
2647: /* dloc is location of diagonal in factor */
2648: PetscMalloc((n+1)*sizeof(int),&dloc);
2649: dloc[0] = 0;
2650: for (prow=0; prow<n; prow++) {
2652: /* copy prow into linked list */
2653: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
2654: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2655: xi = aj + ai[r[prow]];
2656: fill[n] = n;
2657: fill[prow] = -1; /* marker for diagonal entry */
2658: while (nz--) {
2659: fm = n;
2660: idx = ic[*xi++];
2661: do {
2662: m = fm;
2663: fm = fill[m];
2664: } while (fm < idx);
2665: fill[m] = idx;
2666: fill[idx] = fm;
2667: im[idx] = 0;
2668: }
2670: /* make sure diagonal entry is included */
2671: if (diagonal_fill && fill[prow] == -1) {
2672: fm = n;
2673: while (fill[fm] < prow) fm = fill[fm];
2674: fill[prow] = fill[fm]; /* insert diagonal into linked list */
2675: fill[fm] = prow;
2676: im[prow] = 0;
2677: nzf++;
2678: dcount++;
2679: }
2681: nzi = 0;
2682: row = fill[n];
2683: while (row < prow) {
2684: incrlev = im[row] + 1;
2685: nz = dloc[row];
2686: xi = ajnew + ainew[row] + nz + 1;
2687: flev = ajfill + ainew[row] + nz + 1;
2688: nnz = ainew[row+1] - ainew[row] - nz - 1;
2689: fm = row;
2690: while (nnz-- > 0) {
2691: idx = *xi++;
2692: if (*flev + incrlev > levels) {
2693: flev++;
2694: continue;
2695: }
2696: do {
2697: m = fm;
2698: fm = fill[m];
2699: } while (fm < idx);
2700: if (fm != idx) {
2701: im[idx] = *flev + incrlev;
2702: fill[m] = idx;
2703: fill[idx] = fm;
2704: fm = idx;
2705: nzf++;
2706: } else {
2707: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2708: }
2709: flev++;
2710: }
2711: row = fill[row];
2712: nzi++;
2713: }
2714: /* copy new filled row into permanent storage */
2715: ainew[prow+1] = ainew[prow] + nzf;
2716: if (ainew[prow+1] > jmax) {
2718: /* estimate how much additional space we will need */
2719: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2720: /* just double the memory each time */
2721: int maxadd = jmax;
2722: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2723: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2724: jmax += maxadd;
2726: /* allocate a longer ajnew and ajfill */
2727: PetscMalloc(jmax*sizeof(int),&xi);
2728: PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
2729: PetscFree(ajnew);
2730: ajnew = xi;
2731: PetscMalloc(jmax*sizeof(int),&xi);
2732: PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
2733: PetscFree(ajfill);
2734: ajfill = xi;
2735: realloc++; /* count how many reallocations are needed */
2736: }
2737: xi = ajnew + ainew[prow];
2738: flev = ajfill + ainew[prow];
2739: dloc[prow] = nzi;
2740: fm = fill[n];
2741: while (nzf--) {
2742: *xi++ = fm;
2743: *flev++ = im[fm];
2744: fm = fill[fm];
2745: }
2746: /* make sure row has diagonal entry */
2747: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2748: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrixn
2749: try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2750: }
2751: }
2752: PetscFree(ajfill);
2753: ISRestoreIndices(isrow,&r);
2754: ISRestoreIndices(isicol,&ic);
2755: PetscFree(fill);
2756: PetscFree(im);
2758: {
2759: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2760: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
2761: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use n",af);
2762: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);n",af);
2763: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.n");
2764: if (diagonal_fill) {
2765: PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
2766: }
2767: }
2769: /* put together the new matrix */
2770: MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);
2771: PetscLogObjectParent(*fact,isicol);
2772: b = (Mat_SeqBAIJ*)(*fact)->data;
2773: PetscFree(b->imax);
2774: b->singlemalloc = PETSC_FALSE;
2775: len = bs2*ainew[n]*sizeof(MatScalar);
2776: /* the next line frees the default space generated by the Create() */
2777: PetscFree(b->a);
2778: PetscFree(b->ilen);
2779: PetscMalloc(len,&b->a);
2780: b->j = ajnew;
2781: b->i = ainew;
2782: for (i=0; i<n; i++) dloc[i] += ainew[i];
2783: b->diag = dloc;
2784: b->ilen = 0;
2785: b->imax = 0;
2786: b->row = isrow;
2787: b->col = iscol;
2788: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
2789: ierr = PetscObjectReference((PetscObject)isrow);
2790: ierr = PetscObjectReference((PetscObject)iscol);
2791: b->icol = isicol;
2792: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
2793: /* In b structure: Free imax, ilen, old a, old j.
2794: Allocate dloc, solve_work, new a, new j */
2795: PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
2796: b->maxnz = b->nz = ainew[n];
2797: (*fact)->factor = FACTOR_LU;
2799: (*fact)->info.factor_mallocs = realloc;
2800: (*fact)->info.fill_ratio_given = f;
2801: (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2802: }
2804: if (row_identity && col_identity) {
2805: MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
2806: }
2807: return(0);
2808: }
2810: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
2811: {
2812: /*
2813: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2814: with natural ordering
2815: */
2816: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2817: PetscTruth sse_enabled_local,sse_enabled_global;
2818: int ierr;
2821: inA->ops->solve = MatSolve_SeqBAIJ_Update;
2822: inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update;
2823: switch (a->bs) {
2824: case 1:
2825: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2826: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1n");
2827: break;
2828: case 2:
2829: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2830: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2n");
2831: break;
2832: case 3:
2833: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2834: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3n");
2835: break;
2836: case 4:
2837: PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);
2838: if (sse_enabled_local) {
2839: #if defined(PETSC_HAVE_SSE)
2840: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2841: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4n");
2842: #else
2843: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
2844: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2845: #endif
2846: } else {
2847: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2848: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4n");
2849: }
2850: break;
2851: case 5:
2852: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2853: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5n");
2854: break;
2855: case 6:
2856: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2857: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6n");
2858: break;
2859: case 7:
2860: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2861: PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7n");
2862: break;
2863: }
2864: return(0);
2865: }
2867: int MatSeqBAIJ_UpdateSolvers(Mat A)
2868: {
2869: /*
2870: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2871: with natural ordering
2872: */
2873: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2874: IS row = a->row, col = a->col;
2875: PetscTruth row_identity, col_identity;
2876: PetscTruth sse_enabled_local, sse_enabled_global;
2877: PetscTruth use_single, use_natural;
2878: int ierr;
2882: use_single = PETSC_FALSE;
2883: use_natural = PETSC_FALSE;
2885: ISIdentity(row,&row_identity);
2886: ISIdentity(col,&col_identity);
2888: if (row_identity && col_identity) {
2889: use_natural = PETSC_TRUE;
2890: } else {
2891: use_natural = PETSC_FALSE;
2892: }
2893: switch (a->bs) {
2894: case 1:
2895: if (use_natural) {
2896: A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
2897: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
2898: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1n");
2899: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2900: } else {
2901: A->ops->solve = MatSolve_SeqBAIJ_1;
2902: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
2903: }
2904: break;
2905: case 2:
2906: if (use_natural) {
2907: A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
2908: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
2909: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2n");
2910: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2911: } else {
2912: A->ops->solve = MatSolve_SeqBAIJ_2;
2913: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
2914: }
2915: break;
2916: case 3:
2917: if (use_natural) {
2918: A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
2919: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
2920: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3n");
2921: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2922: } else {
2923: A->ops->solve = MatSolve_SeqBAIJ_3;
2924: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
2925: }
2926: break;
2927: case 4:
2928: PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);
2929: if (sse_enabled_local) {
2930: PetscTruth single_prec,flg;
2931: single_prec = flg = PETSC_FALSE;
2932: PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);
2933: if (flg) {
2934: a->single_precision_solves = single_prec;
2935: }
2936: if (a->single_precision_solves) {
2937: use_single = PETSC_TRUE;
2938: }
2939: } else {
2940: use_single = PETSC_FALSE;
2941: }
2942: if (use_natural) {
2943: if (use_single) {
2944: #if defined(PETSC_HAVE_SSE)
2945: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
2946: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4n");
2947: #else
2948: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
2949: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2950: #endif
2951: } else {
2952: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
2953: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2954: }
2955: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
2956: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2957: } else {
2958: if (use_single) {
2959: #if defined(PETSC_HAVE_SSE)
2960: A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion;
2961: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4n");
2962: #else
2963: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
2964: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2965: #endif
2966: } else {
2967: A->ops->solve = MatSolve_SeqBAIJ_4;
2968: }
2969: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
2970: }
2971: break;
2972: case 5:
2973: if (use_natural) {
2974: A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
2975: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
2976: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5n");
2977: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5n");
2978: } else {
2979: A->ops->solve = MatSolve_SeqBAIJ_5;
2980: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
2981: }
2982: break;
2983: case 6:
2984: if (use_natural) {
2985: A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
2986: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
2987: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6n");
2988: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6n");
2989: } else {
2990: A->ops->solve = MatSolve_SeqBAIJ_6;
2991: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
2992: }
2993: break;
2994: case 7:
2995: if (use_natural) {
2996: A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering;
2997: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
2998: PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7n");
2999: PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7n");
3000: } else {
3001: A->ops->solve = MatSolve_SeqBAIJ_7;
3002: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7;
3003: }
3004: break;
3005: default:
3006: A->ops->solve = MatSolve_SeqBAIJ_N;
3007: break;
3008: }
3009: return(0);
3010: }
3012: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3016: MatSeqBAIJ_UpdateSolvers(A);
3017: if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3018: (*A->ops->solve)(A,x,y);
3019: } else {
3020: SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3021: }
3022: return(0);
3023: }
3025: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3029: MatSeqBAIJ_UpdateSolvers(A);
3030: (*A->ops->solvetranspose)(A,x,y);
3031: return(0);
3032: }