Actual source code: baijfact2.c

  1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */

 6:  #include src/mat/impls/baij/seq/baij.h
 7:  #include src/vec/vecimpl.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 11: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 12: {
 13:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 14:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 15:   int             *diag = a->diag;
 16:   MatScalar       *aa=a->a,*v;
 17:   PetscScalar     s1,*x,*b;

 20:   VecCopy(bb,xx);
 21:   VecGetArray(bb,&b);
 22:   VecGetArray(xx,&x);
 23: 
 24:   /* forward solve the U^T */
 25:   for (i=0; i<n; i++) {

 27:     v     = aa + diag[i];
 28:     /* multiply by the inverse of the block diagonal */
 29:     s1    = (*v++)*x[i];
 30:     vi    = aj + diag[i] + 1;
 31:     nz    = ai[i+1] - diag[i] - 1;
 32:     while (nz--) {
 33:       x[*vi++]  -= (*v++)*s1;
 34:     }
 35:     x[i]   = s1;
 36:   }
 37:   /* backward solve the L^T */
 38:   for (i=n-1; i>=0; i--){
 39:     v    = aa + diag[i] - 1;
 40:     vi   = aj + diag[i] - 1;
 41:     nz   = diag[i] - ai[i];
 42:     s1   = x[i];
 43:     while (nz--) {
 44:       x[*vi--]   -=  (*v--)*s1;
 45:     }
 46:   }
 47:   VecRestoreArray(bb,&b);
 48:   VecRestoreArray(xx,&x);
 49:   PetscLogFlops(2*(a->nz) - A->n);
 50:   return(0);
 51: }

 53: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 54: {
 55:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 56:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 57:   int             *diag = a->diag,oidx;
 58:   MatScalar       *aa=a->a,*v;
 59:   PetscScalar     s1,s2,x1,x2;
 60:   PetscScalar     *x,*b;

 63:   VecCopy(bb,xx);
 64:   VecGetArray(bb,&b);
 65:   VecGetArray(xx,&x);

 67:   /* forward solve the U^T */
 68:   idx = 0;
 69:   for (i=0; i<n; i++) {

 71:     v     = aa + 4*diag[i];
 72:     /* multiply by the inverse of the block diagonal */
 73:     x1 = x[idx];   x2 = x[1+idx];
 74:     s1 = v[0]*x1  +  v[1]*x2;
 75:     s2 = v[2]*x1  +  v[3]*x2;
 76:     v += 4;

 78:     vi    = aj + diag[i] + 1;
 79:     nz    = ai[i+1] - diag[i] - 1;
 80:     while (nz--) {
 81:       oidx = 2*(*vi++);
 82:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 83:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 84:       v  += 4;
 85:     }
 86:     x[idx]   = s1;x[1+idx] = s2;
 87:     idx += 2;
 88:   }
 89:   /* backward solve the L^T */
 90:   for (i=n-1; i>=0; i--){
 91:     v    = aa + 4*diag[i] - 4;
 92:     vi   = aj + diag[i] - 1;
 93:     nz   = diag[i] - ai[i];
 94:     idt  = 2*i;
 95:     s1   = x[idt];  s2 = x[1+idt];
 96:     while (nz--) {
 97:       idx   = 2*(*vi--);
 98:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
 99:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
100:       v -= 4;
101:     }
102:   }
103:   VecRestoreArray(bb,&b);
104:   VecRestoreArray(xx,&x);
105:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
106:   return(0);
107: }

109: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
110: {
111:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
112:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
113:   int             *diag = a->diag,oidx;
114:   MatScalar       *aa=a->a,*v;
115:   PetscScalar     s1,s2,s3,x1,x2,x3;
116:   PetscScalar     *x,*b;

119:   VecCopy(bb,xx);
120:   VecGetArray(bb,&b);
121:   VecGetArray(xx,&x);

123:   /* forward solve the U^T */
124:   idx = 0;
125:   for (i=0; i<n; i++) {

127:     v     = aa + 9*diag[i];
128:     /* multiply by the inverse of the block diagonal */
129:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
130:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
131:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
132:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
133:     v += 9;

135:     vi    = aj + diag[i] + 1;
136:     nz    = ai[i+1] - diag[i] - 1;
137:     while (nz--) {
138:       oidx = 3*(*vi++);
139:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
140:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
141:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
142:       v  += 9;
143:     }
144:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
145:     idx += 3;
146:   }
147:   /* backward solve the L^T */
148:   for (i=n-1; i>=0; i--){
149:     v    = aa + 9*diag[i] - 9;
150:     vi   = aj + diag[i] - 1;
151:     nz   = diag[i] - ai[i];
152:     idt  = 3*i;
153:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
154:     while (nz--) {
155:       idx   = 3*(*vi--);
156:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
157:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
158:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
159:       v -= 9;
160:     }
161:   }
162:   VecRestoreArray(bb,&b);
163:   VecRestoreArray(xx,&x);
164:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
165:   return(0);
166: }

168: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
169: {
170:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
171:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
172:   int             *diag = a->diag,oidx;
173:   MatScalar       *aa=a->a,*v;
174:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
175:   PetscScalar     *x,*b;

178:   VecCopy(bb,xx);
179:   VecGetArray(bb,&b);
180:   VecGetArray(xx,&x);

182:   /* forward solve the U^T */
183:   idx = 0;
184:   for (i=0; i<n; i++) {

186:     v     = aa + 16*diag[i];
187:     /* multiply by the inverse of the block diagonal */
188:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
189:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
190:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
191:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
192:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
193:     v += 16;

195:     vi    = aj + diag[i] + 1;
196:     nz    = ai[i+1] - diag[i] - 1;
197:     while (nz--) {
198:       oidx = 4*(*vi++);
199:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
200:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
201:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
202:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
203:       v  += 16;
204:     }
205:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
206:     idx += 4;
207:   }
208:   /* backward solve the L^T */
209:   for (i=n-1; i>=0; i--){
210:     v    = aa + 16*diag[i] - 16;
211:     vi   = aj + diag[i] - 1;
212:     nz   = diag[i] - ai[i];
213:     idt  = 4*i;
214:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
215:     while (nz--) {
216:       idx   = 4*(*vi--);
217:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
218:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
219:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
220:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
221:       v -= 16;
222:     }
223:   }
224:   VecRestoreArray(bb,&b);
225:   VecRestoreArray(xx,&x);
226:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
227:   return(0);
228: }

230: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
231: {
232:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
233:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
234:   int             *diag = a->diag,oidx;
235:   MatScalar       *aa=a->a,*v;
236:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
237:   PetscScalar     *x,*b;

240:   VecCopy(bb,xx);
241:   VecGetArray(bb,&b);
242:   VecGetArray(xx,&x);

244:   /* forward solve the U^T */
245:   idx = 0;
246:   for (i=0; i<n; i++) {

248:     v     = aa + 25*diag[i];
249:     /* multiply by the inverse of the block diagonal */
250:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
251:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
252:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
253:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
254:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
255:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
256:     v += 25;

258:     vi    = aj + diag[i] + 1;
259:     nz    = ai[i+1] - diag[i] - 1;
260:     while (nz--) {
261:       oidx = 5*(*vi++);
262:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
263:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
264:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
265:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
266:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
267:       v  += 25;
268:     }
269:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
270:     idx += 5;
271:   }
272:   /* backward solve the L^T */
273:   for (i=n-1; i>=0; i--){
274:     v    = aa + 25*diag[i] - 25;
275:     vi   = aj + diag[i] - 1;
276:     nz   = diag[i] - ai[i];
277:     idt  = 5*i;
278:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
279:     while (nz--) {
280:       idx   = 5*(*vi--);
281:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
282:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
283:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
284:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
285:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
286:       v -= 25;
287:     }
288:   }
289:   VecRestoreArray(bb,&b);
290:   VecRestoreArray(xx,&x);
291:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
292:   return(0);
293: }

295: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
296: {
297:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
298:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
299:   int             *diag = a->diag,oidx;
300:   MatScalar       *aa=a->a,*v;
301:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
302:   PetscScalar     *x,*b;

305:   VecCopy(bb,xx);
306:   VecGetArray(bb,&b);
307:   VecGetArray(xx,&x);

309:   /* forward solve the U^T */
310:   idx = 0;
311:   for (i=0; i<n; i++) {

313:     v     = aa + 36*diag[i];
314:     /* multiply by the inverse of the block diagonal */
315:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
316:     x6    = x[5+idx];
317:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
318:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
319:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
320:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
321:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
322:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
323:     v += 36;

325:     vi    = aj + diag[i] + 1;
326:     nz    = ai[i+1] - diag[i] - 1;
327:     while (nz--) {
328:       oidx = 6*(*vi++);
329:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
330:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
331:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
332:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
333:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
334:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
335:       v  += 36;
336:     }
337:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
338:     x[5+idx] = s6;
339:     idx += 6;
340:   }
341:   /* backward solve the L^T */
342:   for (i=n-1; i>=0; i--){
343:     v    = aa + 36*diag[i] - 36;
344:     vi   = aj + diag[i] - 1;
345:     nz   = diag[i] - ai[i];
346:     idt  = 6*i;
347:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
348:     s6 = x[5+idt];
349:     while (nz--) {
350:       idx   = 6*(*vi--);
351:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
352:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
353:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
354:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
355:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
356:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
357:       v -= 36;
358:     }
359:   }
360:   VecRestoreArray(bb,&b);
361:   VecRestoreArray(xx,&x);
362:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
363:   return(0);
364: }

366: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
367: {
368:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
369:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
370:   int             *diag = a->diag,oidx;
371:   MatScalar       *aa=a->a,*v;
372:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
373:   PetscScalar     *x,*b;

376:   VecCopy(bb,xx);
377:   VecGetArray(bb,&b);
378:   VecGetArray(xx,&x);

380:   /* forward solve the U^T */
381:   idx = 0;
382:   for (i=0; i<n; i++) {

384:     v     = aa + 49*diag[i];
385:     /* multiply by the inverse of the block diagonal */
386:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
387:     x6    = x[5+idx]; x7 = x[6+idx];
388:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
389:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
390:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
391:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
392:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
393:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
394:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
395:     v += 49;

397:     vi    = aj + diag[i] + 1;
398:     nz    = ai[i+1] - diag[i] - 1;
399:     while (nz--) {
400:       oidx = 7*(*vi++);
401:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
402:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
403:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
404:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
405:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
406:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
407:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
408:       v  += 49;
409:     }
410:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
411:     x[5+idx] = s6;x[6+idx] = s7;
412:     idx += 7;
413:   }
414:   /* backward solve the L^T */
415:   for (i=n-1; i>=0; i--){
416:     v    = aa + 49*diag[i] - 49;
417:     vi   = aj + diag[i] - 1;
418:     nz   = diag[i] - ai[i];
419:     idt  = 7*i;
420:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
421:     s6 = x[5+idt];s7 = x[6+idt];
422:     while (nz--) {
423:       idx   = 7*(*vi--);
424:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431:       v -= 49;
432:     }
433:   }
434:   VecRestoreArray(bb,&b);
435:   VecRestoreArray(xx,&x);
436:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
437:   return(0);
438: }

440: /*---------------------------------------------------------------------------------------------*/
441: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
442: {
443:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
444:   IS              iscol=a->col,isrow=a->row;
445:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
446:   int             *diag = a->diag;
447:   MatScalar       *aa=a->a,*v;
448:   PetscScalar     s1,*x,*b,*t;

451:   VecGetArray(bb,&b);
452:   VecGetArray(xx,&x);
453:   t  = a->solve_work;

455:   ISGetIndices(isrow,&rout); r = rout;
456:   ISGetIndices(iscol,&cout); c = cout;

458:   /* copy the b into temp work space according to permutation */
459:   for (i=0; i<n; i++) {
460:     t[i] = b[c[i]];
461:   }

463:   /* forward solve the U^T */
464:   for (i=0; i<n; i++) {

466:     v     = aa + diag[i];
467:     /* multiply by the inverse of the block diagonal */
468:     s1    = (*v++)*t[i];
469:     vi    = aj + diag[i] + 1;
470:     nz    = ai[i+1] - diag[i] - 1;
471:     while (nz--) {
472:       t[*vi++]  -= (*v++)*s1;
473:     }
474:     t[i]   = s1;
475:   }
476:   /* backward solve the L^T */
477:   for (i=n-1; i>=0; i--){
478:     v    = aa + diag[i] - 1;
479:     vi   = aj + diag[i] - 1;
480:     nz   = diag[i] - ai[i];
481:     s1   = t[i];
482:     while (nz--) {
483:       t[*vi--]   -=  (*v--)*s1;
484:     }
485:   }

487:   /* copy t into x according to permutation */
488:   for (i=0; i<n; i++) {
489:     x[r[i]]   = t[i];
490:   }

492:   ISRestoreIndices(isrow,&rout);
493:   ISRestoreIndices(iscol,&cout);
494:   VecRestoreArray(bb,&b);
495:   VecRestoreArray(xx,&x);
496:   PetscLogFlops(2*(a->nz) - A->n);
497:   return(0);
498: }

500: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
501: {
502:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
503:   IS              iscol=a->col,isrow=a->row;
504:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
505:   int             *diag = a->diag,ii,ic,ir,oidx;
506:   MatScalar       *aa=a->a,*v;
507:   PetscScalar     s1,s2,x1,x2;
508:   PetscScalar     *x,*b,*t;

511:   VecGetArray(bb,&b);
512:   VecGetArray(xx,&x);
513:   t  = a->solve_work;

515:   ISGetIndices(isrow,&rout); r = rout;
516:   ISGetIndices(iscol,&cout); c = cout;

518:   /* copy the b into temp work space according to permutation */
519:   ii = 0;
520:   for (i=0; i<n; i++) {
521:     ic      = 2*c[i];
522:     t[ii]   = b[ic];
523:     t[ii+1] = b[ic+1];
524:     ii += 2;
525:   }

527:   /* forward solve the U^T */
528:   idx = 0;
529:   for (i=0; i<n; i++) {

531:     v     = aa + 4*diag[i];
532:     /* multiply by the inverse of the block diagonal */
533:     x1    = t[idx];   x2 = t[1+idx];
534:     s1 = v[0]*x1  +  v[1]*x2;
535:     s2 = v[2]*x1  +  v[3]*x2;
536:     v += 4;

538:     vi    = aj + diag[i] + 1;
539:     nz    = ai[i+1] - diag[i] - 1;
540:     while (nz--) {
541:       oidx = 2*(*vi++);
542:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
543:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
544:       v  += 4;
545:     }
546:     t[idx]   = s1;t[1+idx] = s2;
547:     idx += 2;
548:   }
549:   /* backward solve the L^T */
550:   for (i=n-1; i>=0; i--){
551:     v    = aa + 4*diag[i] - 4;
552:     vi   = aj + diag[i] - 1;
553:     nz   = diag[i] - ai[i];
554:     idt  = 2*i;
555:     s1 = t[idt];  s2 = t[1+idt];
556:     while (nz--) {
557:       idx   = 2*(*vi--);
558:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
559:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
560:       v -= 4;
561:     }
562:   }

564:   /* copy t into x according to permutation */
565:   ii = 0;
566:   for (i=0; i<n; i++) {
567:     ir      = 2*r[i];
568:     x[ir]   = t[ii];
569:     x[ir+1] = t[ii+1];
570:     ii += 2;
571:   }

573:   ISRestoreIndices(isrow,&rout);
574:   ISRestoreIndices(iscol,&cout);
575:   VecRestoreArray(bb,&b);
576:   VecRestoreArray(xx,&x);
577:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
578:   return(0);
579: }

581: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
582: {
583:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
584:   IS              iscol=a->col,isrow=a->row;
585:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
586:   int             *diag = a->diag,ii,ic,ir,oidx;
587:   MatScalar       *aa=a->a,*v;
588:   PetscScalar     s1,s2,s3,x1,x2,x3;
589:   PetscScalar     *x,*b,*t;

592:   VecGetArray(bb,&b);
593:   VecGetArray(xx,&x);
594:   t  = a->solve_work;

596:   ISGetIndices(isrow,&rout); r = rout;
597:   ISGetIndices(iscol,&cout); c = cout;

599:   /* copy the b into temp work space according to permutation */
600:   ii = 0;
601:   for (i=0; i<n; i++) {
602:     ic      = 3*c[i];
603:     t[ii]   = b[ic];
604:     t[ii+1] = b[ic+1];
605:     t[ii+2] = b[ic+2];
606:     ii += 3;
607:   }

609:   /* forward solve the U^T */
610:   idx = 0;
611:   for (i=0; i<n; i++) {

613:     v     = aa + 9*diag[i];
614:     /* multiply by the inverse of the block diagonal */
615:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
616:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
617:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
618:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
619:     v += 9;

621:     vi    = aj + diag[i] + 1;
622:     nz    = ai[i+1] - diag[i] - 1;
623:     while (nz--) {
624:       oidx = 3*(*vi++);
625:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
626:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
627:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
628:       v  += 9;
629:     }
630:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
631:     idx += 3;
632:   }
633:   /* backward solve the L^T */
634:   for (i=n-1; i>=0; i--){
635:     v    = aa + 9*diag[i] - 9;
636:     vi   = aj + diag[i] - 1;
637:     nz   = diag[i] - ai[i];
638:     idt  = 3*i;
639:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
640:     while (nz--) {
641:       idx   = 3*(*vi--);
642:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
643:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
644:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
645:       v -= 9;
646:     }
647:   }

649:   /* copy t into x according to permutation */
650:   ii = 0;
651:   for (i=0; i<n; i++) {
652:     ir      = 3*r[i];
653:     x[ir]   = t[ii];
654:     x[ir+1] = t[ii+1];
655:     x[ir+2] = t[ii+2];
656:     ii += 3;
657:   }

659:   ISRestoreIndices(isrow,&rout);
660:   ISRestoreIndices(iscol,&cout);
661:   VecRestoreArray(bb,&b);
662:   VecRestoreArray(xx,&x);
663:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
664:   return(0);
665: }

667: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
668: {
669:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
670:   IS              iscol=a->col,isrow=a->row;
671:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
672:   int             *diag = a->diag,ii,ic,ir,oidx;
673:   MatScalar       *aa=a->a,*v;
674:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
675:   PetscScalar     *x,*b,*t;

678:   VecGetArray(bb,&b);
679:   VecGetArray(xx,&x);
680:   t  = a->solve_work;

682:   ISGetIndices(isrow,&rout); r = rout;
683:   ISGetIndices(iscol,&cout); c = cout;

685:   /* copy the b into temp work space according to permutation */
686:   ii = 0;
687:   for (i=0; i<n; i++) {
688:     ic      = 4*c[i];
689:     t[ii]   = b[ic];
690:     t[ii+1] = b[ic+1];
691:     t[ii+2] = b[ic+2];
692:     t[ii+3] = b[ic+3];
693:     ii += 4;
694:   }

696:   /* forward solve the U^T */
697:   idx = 0;
698:   for (i=0; i<n; i++) {

700:     v     = aa + 16*diag[i];
701:     /* multiply by the inverse of the block diagonal */
702:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
703:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
704:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
705:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
706:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
707:     v += 16;

709:     vi    = aj + diag[i] + 1;
710:     nz    = ai[i+1] - diag[i] - 1;
711:     while (nz--) {
712:       oidx = 4*(*vi++);
713:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
714:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
715:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
716:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
717:       v  += 16;
718:     }
719:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
720:     idx += 4;
721:   }
722:   /* backward solve the L^T */
723:   for (i=n-1; i>=0; i--){
724:     v    = aa + 16*diag[i] - 16;
725:     vi   = aj + diag[i] - 1;
726:     nz   = diag[i] - ai[i];
727:     idt  = 4*i;
728:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
729:     while (nz--) {
730:       idx   = 4*(*vi--);
731:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
732:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
733:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
734:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
735:       v -= 16;
736:     }
737:   }

739:   /* copy t into x according to permutation */
740:   ii = 0;
741:   for (i=0; i<n; i++) {
742:     ir      = 4*r[i];
743:     x[ir]   = t[ii];
744:     x[ir+1] = t[ii+1];
745:     x[ir+2] = t[ii+2];
746:     x[ir+3] = t[ii+3];
747:     ii += 4;
748:   }

750:   ISRestoreIndices(isrow,&rout);
751:   ISRestoreIndices(iscol,&cout);
752:   VecRestoreArray(bb,&b);
753:   VecRestoreArray(xx,&x);
754:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
755:   return(0);
756: }

758: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
759: {
760:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
761:   IS              iscol=a->col,isrow=a->row;
762:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
763:   int             *diag = a->diag,ii,ic,ir,oidx;
764:   MatScalar       *aa=a->a,*v;
765:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
766:   PetscScalar     *x,*b,*t;

769:   VecGetArray(bb,&b);
770:   VecGetArray(xx,&x);
771:   t  = a->solve_work;

773:   ISGetIndices(isrow,&rout); r = rout;
774:   ISGetIndices(iscol,&cout); c = cout;

776:   /* copy the b into temp work space according to permutation */
777:   ii = 0;
778:   for (i=0; i<n; i++) {
779:     ic      = 5*c[i];
780:     t[ii]   = b[ic];
781:     t[ii+1] = b[ic+1];
782:     t[ii+2] = b[ic+2];
783:     t[ii+3] = b[ic+3];
784:     t[ii+4] = b[ic+4];
785:     ii += 5;
786:   }

788:   /* forward solve the U^T */
789:   idx = 0;
790:   for (i=0; i<n; i++) {

792:     v     = aa + 25*diag[i];
793:     /* multiply by the inverse of the block diagonal */
794:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
795:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
796:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
797:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
798:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
799:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
800:     v += 25;

802:     vi    = aj + diag[i] + 1;
803:     nz    = ai[i+1] - diag[i] - 1;
804:     while (nz--) {
805:       oidx = 5*(*vi++);
806:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
807:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
808:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
809:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
810:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
811:       v  += 25;
812:     }
813:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
814:     idx += 5;
815:   }
816:   /* backward solve the L^T */
817:   for (i=n-1; i>=0; i--){
818:     v    = aa + 25*diag[i] - 25;
819:     vi   = aj + diag[i] - 1;
820:     nz   = diag[i] - ai[i];
821:     idt  = 5*i;
822:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
823:     while (nz--) {
824:       idx   = 5*(*vi--);
825:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
826:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
827:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
828:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
829:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
830:       v -= 25;
831:     }
832:   }

834:   /* copy t into x according to permutation */
835:   ii = 0;
836:   for (i=0; i<n; i++) {
837:     ir      = 5*r[i];
838:     x[ir]   = t[ii];
839:     x[ir+1] = t[ii+1];
840:     x[ir+2] = t[ii+2];
841:     x[ir+3] = t[ii+3];
842:     x[ir+4] = t[ii+4];
843:     ii += 5;
844:   }

846:   ISRestoreIndices(isrow,&rout);
847:   ISRestoreIndices(iscol,&cout);
848:   VecRestoreArray(bb,&b);
849:   VecRestoreArray(xx,&x);
850:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
851:   return(0);
852: }

854: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
855: {
856:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
857:   IS              iscol=a->col,isrow=a->row;
858:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
859:   int             *diag = a->diag,ii,ic,ir,oidx;
860:   MatScalar       *aa=a->a,*v;
861:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
862:   PetscScalar     *x,*b,*t;

865:   VecGetArray(bb,&b);
866:   VecGetArray(xx,&x);
867:   t  = a->solve_work;

869:   ISGetIndices(isrow,&rout); r = rout;
870:   ISGetIndices(iscol,&cout); c = cout;

872:   /* copy the b into temp work space according to permutation */
873:   ii = 0;
874:   for (i=0; i<n; i++) {
875:     ic      = 6*c[i];
876:     t[ii]   = b[ic];
877:     t[ii+1] = b[ic+1];
878:     t[ii+2] = b[ic+2];
879:     t[ii+3] = b[ic+3];
880:     t[ii+4] = b[ic+4];
881:     t[ii+5] = b[ic+5];
882:     ii += 6;
883:   }

885:   /* forward solve the U^T */
886:   idx = 0;
887:   for (i=0; i<n; i++) {

889:     v     = aa + 36*diag[i];
890:     /* multiply by the inverse of the block diagonal */
891:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
892:     x6    = t[5+idx];
893:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
894:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
895:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
896:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
897:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
898:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
899:     v += 36;

901:     vi    = aj + diag[i] + 1;
902:     nz    = ai[i+1] - diag[i] - 1;
903:     while (nz--) {
904:       oidx = 6*(*vi++);
905:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
906:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
907:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
908:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
909:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
910:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
911:       v  += 36;
912:     }
913:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
914:     t[5+idx] = s6;
915:     idx += 6;
916:   }
917:   /* backward solve the L^T */
918:   for (i=n-1; i>=0; i--){
919:     v    = aa + 36*diag[i] - 36;
920:     vi   = aj + diag[i] - 1;
921:     nz   = diag[i] - ai[i];
922:     idt  = 6*i;
923:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
924:     s6 = t[5+idt];
925:     while (nz--) {
926:       idx   = 6*(*vi--);
927:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
928:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
929:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
930:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
931:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
932:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
933:       v -= 36;
934:     }
935:   }

937:   /* copy t into x according to permutation */
938:   ii = 0;
939:   for (i=0; i<n; i++) {
940:     ir      = 6*r[i];
941:     x[ir]   = t[ii];
942:     x[ir+1] = t[ii+1];
943:     x[ir+2] = t[ii+2];
944:     x[ir+3] = t[ii+3];
945:     x[ir+4] = t[ii+4];
946:     x[ir+5] = t[ii+5];
947:     ii += 6;
948:   }

950:   ISRestoreIndices(isrow,&rout);
951:   ISRestoreIndices(iscol,&cout);
952:   VecRestoreArray(bb,&b);
953:   VecRestoreArray(xx,&x);
954:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
955:   return(0);
956: }

958: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
959: {
960:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
961:   IS              iscol=a->col,isrow=a->row;
962:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
963:   int             *diag = a->diag,ii,ic,ir,oidx;
964:   MatScalar       *aa=a->a,*v;
965:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
966:   PetscScalar     *x,*b,*t;

969:   VecGetArray(bb,&b);
970:   VecGetArray(xx,&x);
971:   t  = a->solve_work;

973:   ISGetIndices(isrow,&rout); r = rout;
974:   ISGetIndices(iscol,&cout); c = cout;

976:   /* copy the b into temp work space according to permutation */
977:   ii = 0;
978:   for (i=0; i<n; i++) {
979:     ic      = 7*c[i];
980:     t[ii]   = b[ic];
981:     t[ii+1] = b[ic+1];
982:     t[ii+2] = b[ic+2];
983:     t[ii+3] = b[ic+3];
984:     t[ii+4] = b[ic+4];
985:     t[ii+5] = b[ic+5];
986:     t[ii+6] = b[ic+6];
987:     ii += 7;
988:   }

990:   /* forward solve the U^T */
991:   idx = 0;
992:   for (i=0; i<n; i++) {

994:     v     = aa + 49*diag[i];
995:     /* multiply by the inverse of the block diagonal */
996:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
997:     x6    = t[5+idx]; x7 = t[6+idx];
998:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
999:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1000:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1001:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1002:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1003:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1004:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1005:     v += 49;

1007:     vi    = aj + diag[i] + 1;
1008:     nz    = ai[i+1] - diag[i] - 1;
1009:     while (nz--) {
1010:       oidx = 7*(*vi++);
1011:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1012:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1013:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1014:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1015:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1016:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1017:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1018:       v  += 49;
1019:     }
1020:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1021:     t[5+idx] = s6;t[6+idx] = s7;
1022:     idx += 7;
1023:   }
1024:   /* backward solve the L^T */
1025:   for (i=n-1; i>=0; i--){
1026:     v    = aa + 49*diag[i] - 49;
1027:     vi   = aj + diag[i] - 1;
1028:     nz   = diag[i] - ai[i];
1029:     idt  = 7*i;
1030:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1031:     s6 = t[5+idt];s7 = t[6+idt];
1032:     while (nz--) {
1033:       idx   = 7*(*vi--);
1034:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1035:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1036:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1037:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1038:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1039:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1040:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1041:       v -= 49;
1042:     }
1043:   }

1045:   /* copy t into x according to permutation */
1046:   ii = 0;
1047:   for (i=0; i<n; i++) {
1048:     ir      = 7*r[i];
1049:     x[ir]   = t[ii];
1050:     x[ir+1] = t[ii+1];
1051:     x[ir+2] = t[ii+2];
1052:     x[ir+3] = t[ii+3];
1053:     x[ir+4] = t[ii+4];
1054:     x[ir+5] = t[ii+5];
1055:     x[ir+6] = t[ii+6];
1056:     ii += 7;
1057:   }

1059:   ISRestoreIndices(isrow,&rout);
1060:   ISRestoreIndices(iscol,&cout);
1061:   VecRestoreArray(bb,&b);
1062:   VecRestoreArray(xx,&x);
1063:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1064:   return(0);
1065: }

1067: /* ----------------------------------------------------------- */
1068: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1069: {
1070:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1071:   IS              iscol=a->col,isrow=a->row;
1072:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1073:   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1074:   MatScalar       *aa=a->a,*v;
1075:   PetscScalar     *x,*b,*s,*t,*ls;

1078:   VecGetArray(bb,&b);
1079:   VecGetArray(xx,&x);
1080:   t  = a->solve_work;

1082:   ISGetIndices(isrow,&rout); r = rout;
1083:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1085:   /* forward solve the lower triangular */
1086:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1087:   for (i=1; i<n; i++) {
1088:     v   = aa + bs2*ai[i];
1089:     vi  = aj + ai[i];
1090:     nz  = a->diag[i] - ai[i];
1091:     s = t + bs*i;
1092:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1093:     while (nz--) {
1094:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1095:       v += bs2;
1096:     }
1097:   }
1098:   /* backward solve the upper triangular */
1099:   ls = a->solve_work + A->n;
1100:   for (i=n-1; i>=0; i--){
1101:     v   = aa + bs2*(a->diag[i] + 1);
1102:     vi  = aj + a->diag[i] + 1;
1103:     nz  = ai[i+1] - a->diag[i] - 1;
1104:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1105:     while (nz--) {
1106:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1107:       v += bs2;
1108:     }
1109:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1110:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1111:   }

1113:   ISRestoreIndices(isrow,&rout);
1114:   ISRestoreIndices(iscol,&cout);
1115:   VecRestoreArray(bb,&b);
1116:   VecRestoreArray(xx,&x);
1117:   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1118:   return(0);
1119: }

1121: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1122: {
1123:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1124:   IS              iscol=a->col,isrow=a->row;
1125:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1126:   int             *diag = a->diag;
1127:   MatScalar       *aa=a->a,*v;
1128:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1129:   PetscScalar     *x,*b,*t;

1132:   VecGetArray(bb,&b);
1133:   VecGetArray(xx,&x);
1134:   t  = a->solve_work;

1136:   ISGetIndices(isrow,&rout); r = rout;
1137:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1139:   /* forward solve the lower triangular */
1140:   idx    = 7*(*r++);
1141:   t[0] = b[idx];   t[1] = b[1+idx];
1142:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1143:   t[5] = b[5+idx]; t[6] = b[6+idx];

1145:   for (i=1; i<n; i++) {
1146:     v     = aa + 49*ai[i];
1147:     vi    = aj + ai[i];
1148:     nz    = diag[i] - ai[i];
1149:     idx   = 7*(*r++);
1150:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1151:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1152:     while (nz--) {
1153:       idx   = 7*(*vi++);
1154:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1155:       x4    = t[3+idx];x5 = t[4+idx];
1156:       x6    = t[5+idx];x7 = t[6+idx];
1157:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1158:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1159:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1160:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1161:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1162:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1163:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1164:       v += 49;
1165:     }
1166:     idx = 7*i;
1167:     t[idx]   = s1;t[1+idx] = s2;
1168:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1169:     t[5+idx] = s6;t[6+idx] = s7;
1170:   }
1171:   /* backward solve the upper triangular */
1172:   for (i=n-1; i>=0; i--){
1173:     v    = aa + 49*diag[i] + 49;
1174:     vi   = aj + diag[i] + 1;
1175:     nz   = ai[i+1] - diag[i] - 1;
1176:     idt  = 7*i;
1177:     s1 = t[idt];  s2 = t[1+idt];
1178:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1179:     s6 = t[5+idt];s7 = t[6+idt];
1180:     while (nz--) {
1181:       idx   = 7*(*vi++);
1182:       x1    = t[idx];   x2 = t[1+idx];
1183:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1184:       x6    = t[5+idx]; x7 = t[6+idx];
1185:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1186:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1187:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1188:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1189:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1190:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1191:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1192:       v += 49;
1193:     }
1194:     idc = 7*(*c--);
1195:     v   = aa + 49*diag[i];
1196:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1197:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1198:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1199:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1200:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1201:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1202:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1203:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1204:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1205:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1206:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1207:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1208:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1209:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1210:   }

1212:   ISRestoreIndices(isrow,&rout);
1213:   ISRestoreIndices(iscol,&cout);
1214:   VecRestoreArray(bb,&b);
1215:   VecRestoreArray(xx,&x);
1216:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1217:   return(0);
1218: }

1220: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1221: {
1222:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1223:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1224:   int             ierr,*diag = a->diag,jdx;
1225:   MatScalar       *aa=a->a,*v;
1226:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1229:   VecGetArray(bb,&b);
1230:   VecGetArray(xx,&x);
1231:   /* forward solve the lower triangular */
1232:   idx    = 0;
1233:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1234:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1235:   x[6] = b[6+idx];
1236:   for (i=1; i<n; i++) {
1237:     v     =  aa + 49*ai[i];
1238:     vi    =  aj + ai[i];
1239:     nz    =  diag[i] - ai[i];
1240:     idx   =  7*i;
1241:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1242:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1243:     s7  =  b[6+idx];
1244:     while (nz--) {
1245:       jdx   = 7*(*vi++);
1246:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1247:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1248:       x7    = x[6+jdx];
1249:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1250:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1251:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1252:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1253:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1254:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1255:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1256:       v += 49;
1257:      }
1258:     x[idx]   = s1;
1259:     x[1+idx] = s2;
1260:     x[2+idx] = s3;
1261:     x[3+idx] = s4;
1262:     x[4+idx] = s5;
1263:     x[5+idx] = s6;
1264:     x[6+idx] = s7;
1265:   }
1266:   /* backward solve the upper triangular */
1267:   for (i=n-1; i>=0; i--){
1268:     v    = aa + 49*diag[i] + 49;
1269:     vi   = aj + diag[i] + 1;
1270:     nz   = ai[i+1] - diag[i] - 1;
1271:     idt  = 7*i;
1272:     s1 = x[idt];   s2 = x[1+idt];
1273:     s3 = x[2+idt]; s4 = x[3+idt];
1274:     s5 = x[4+idt]; s6 = x[5+idt];
1275:     s7 = x[6+idt];
1276:     while (nz--) {
1277:       idx   = 7*(*vi++);
1278:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1279:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1280:       x7    = x[6+idx];
1281:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1282:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1283:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1284:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1285:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1286:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1287:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1288:       v += 49;
1289:     }
1290:     v        = aa + 49*diag[i];
1291:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1292:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1293:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1294:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1295:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1296:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1297:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1298:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1299:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1300:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1301:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1302:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1303:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1304:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1305:   }

1307:   VecRestoreArray(bb,&b);
1308:   VecRestoreArray(xx,&x);
1309:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1310:   return(0);
1311: }

1313: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1314: {
1315:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1316:   IS              iscol=a->col,isrow=a->row;
1317:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1318:   int             *diag = a->diag;
1319:   MatScalar       *aa=a->a,*v;
1320:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1323:   VecGetArray(bb,&b);
1324:   VecGetArray(xx,&x);
1325:   t  = a->solve_work;

1327:   ISGetIndices(isrow,&rout); r = rout;
1328:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1330:   /* forward solve the lower triangular */
1331:   idx    = 6*(*r++);
1332:   t[0] = b[idx];   t[1] = b[1+idx];
1333:   t[2] = b[2+idx]; t[3] = b[3+idx];
1334:   t[4] = b[4+idx]; t[5] = b[5+idx];
1335:   for (i=1; i<n; i++) {
1336:     v     = aa + 36*ai[i];
1337:     vi    = aj + ai[i];
1338:     nz    = diag[i] - ai[i];
1339:     idx   = 6*(*r++);
1340:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1341:     s5  = b[4+idx]; s6 = b[5+idx];
1342:     while (nz--) {
1343:       idx   = 6*(*vi++);
1344:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1345:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1346:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1347:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1348:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1349:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1350:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1351:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1352:       v += 36;
1353:     }
1354:     idx = 6*i;
1355:     t[idx]   = s1;t[1+idx] = s2;
1356:     t[2+idx] = s3;t[3+idx] = s4;
1357:     t[4+idx] = s5;t[5+idx] = s6;
1358:   }
1359:   /* backward solve the upper triangular */
1360:   for (i=n-1; i>=0; i--){
1361:     v    = aa + 36*diag[i] + 36;
1362:     vi   = aj + diag[i] + 1;
1363:     nz   = ai[i+1] - diag[i] - 1;
1364:     idt  = 6*i;
1365:     s1 = t[idt];  s2 = t[1+idt];
1366:     s3 = t[2+idt];s4 = t[3+idt];
1367:     s5 = t[4+idt];s6 = t[5+idt];
1368:     while (nz--) {
1369:       idx   = 6*(*vi++);
1370:       x1    = t[idx];   x2 = t[1+idx];
1371:       x3    = t[2+idx]; x4 = t[3+idx];
1372:       x5    = t[4+idx]; x6 = t[5+idx];
1373:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1374:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1375:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1376:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1377:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1378:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1379:       v += 36;
1380:     }
1381:     idc = 6*(*c--);
1382:     v   = aa + 36*diag[i];
1383:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1384:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1385:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1386:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1387:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1388:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1389:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1390:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1391:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1392:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1393:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1394:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1395:   }

1397:   ISRestoreIndices(isrow,&rout);
1398:   ISRestoreIndices(iscol,&cout);
1399:   VecRestoreArray(bb,&b);
1400:   VecRestoreArray(xx,&x);
1401:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1402:   return(0);
1403: }

1405: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1406: {
1407:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1408:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1409:   int             ierr,*diag = a->diag,jdx;
1410:   MatScalar       *aa=a->a,*v;
1411:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1414:   VecGetArray(bb,&b);
1415:   VecGetArray(xx,&x);
1416:   /* forward solve the lower triangular */
1417:   idx    = 0;
1418:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1419:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1420:   for (i=1; i<n; i++) {
1421:     v     =  aa + 36*ai[i];
1422:     vi    =  aj + ai[i];
1423:     nz    =  diag[i] - ai[i];
1424:     idx   =  6*i;
1425:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1426:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1427:     while (nz--) {
1428:       jdx   = 6*(*vi++);
1429:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1430:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1431:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1432:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1433:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1434:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1435:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1436:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1437:       v += 36;
1438:      }
1439:     x[idx]   = s1;
1440:     x[1+idx] = s2;
1441:     x[2+idx] = s3;
1442:     x[3+idx] = s4;
1443:     x[4+idx] = s5;
1444:     x[5+idx] = s6;
1445:   }
1446:   /* backward solve the upper triangular */
1447:   for (i=n-1; i>=0; i--){
1448:     v    = aa + 36*diag[i] + 36;
1449:     vi   = aj + diag[i] + 1;
1450:     nz   = ai[i+1] - diag[i] - 1;
1451:     idt  = 6*i;
1452:     s1 = x[idt];   s2 = x[1+idt];
1453:     s3 = x[2+idt]; s4 = x[3+idt];
1454:     s5 = x[4+idt]; s6 = x[5+idt];
1455:     while (nz--) {
1456:       idx   = 6*(*vi++);
1457:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1458:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1459:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1460:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1461:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1462:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1463:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1464:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1465:       v += 36;
1466:     }
1467:     v        = aa + 36*diag[i];
1468:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1469:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1470:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1471:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1472:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1473:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1474:   }

1476:   VecRestoreArray(bb,&b);
1477:   VecRestoreArray(xx,&x);
1478:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1479:   return(0);
1480: }

1482: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1483: {
1484:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1485:   IS              iscol=a->col,isrow=a->row;
1486:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1487:   int             *diag = a->diag;
1488:   MatScalar       *aa=a->a,*v;
1489:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1492:   VecGetArray(bb,&b);
1493:   VecGetArray(xx,&x);
1494:   t  = a->solve_work;

1496:   ISGetIndices(isrow,&rout); r = rout;
1497:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1499:   /* forward solve the lower triangular */
1500:   idx    = 5*(*r++);
1501:   t[0] = b[idx];   t[1] = b[1+idx];
1502:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1503:   for (i=1; i<n; i++) {
1504:     v     = aa + 25*ai[i];
1505:     vi    = aj + ai[i];
1506:     nz    = diag[i] - ai[i];
1507:     idx   = 5*(*r++);
1508:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1509:     s5  = b[4+idx];
1510:     while (nz--) {
1511:       idx   = 5*(*vi++);
1512:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1513:       x4    = t[3+idx];x5 = t[4+idx];
1514:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1515:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1516:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1517:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1518:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1519:       v += 25;
1520:     }
1521:     idx = 5*i;
1522:     t[idx]   = s1;t[1+idx] = s2;
1523:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1524:   }
1525:   /* backward solve the upper triangular */
1526:   for (i=n-1; i>=0; i--){
1527:     v    = aa + 25*diag[i] + 25;
1528:     vi   = aj + diag[i] + 1;
1529:     nz   = ai[i+1] - diag[i] - 1;
1530:     idt  = 5*i;
1531:     s1 = t[idt];  s2 = t[1+idt];
1532:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1533:     while (nz--) {
1534:       idx   = 5*(*vi++);
1535:       x1    = t[idx];   x2 = t[1+idx];
1536:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1537:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1538:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1539:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1540:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1541:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1542:       v += 25;
1543:     }
1544:     idc = 5*(*c--);
1545:     v   = aa + 25*diag[i];
1546:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1547:                                  v[15]*s4+v[20]*s5;
1548:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1549:                                  v[16]*s4+v[21]*s5;
1550:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1551:                                  v[17]*s4+v[22]*s5;
1552:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1553:                                  v[18]*s4+v[23]*s5;
1554:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1555:                                  v[19]*s4+v[24]*s5;
1556:   }

1558:   ISRestoreIndices(isrow,&rout);
1559:   ISRestoreIndices(iscol,&cout);
1560:   VecRestoreArray(bb,&b);
1561:   VecRestoreArray(xx,&x);
1562:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1563:   return(0);
1564: }

1566: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1567: {
1568:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1569:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1570:   int             ierr,*diag = a->diag,jdx;
1571:   MatScalar       *aa=a->a,*v;
1572:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1575:   VecGetArray(bb,&b);
1576:   VecGetArray(xx,&x);
1577:   /* forward solve the lower triangular */
1578:   idx    = 0;
1579:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1580:   for (i=1; i<n; i++) {
1581:     v     =  aa + 25*ai[i];
1582:     vi    =  aj + ai[i];
1583:     nz    =  diag[i] - ai[i];
1584:     idx   =  5*i;
1585:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1586:     while (nz--) {
1587:       jdx   = 5*(*vi++);
1588:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1589:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1590:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1591:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1592:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1593:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1594:       v    += 25;
1595:     }
1596:     x[idx]   = s1;
1597:     x[1+idx] = s2;
1598:     x[2+idx] = s3;
1599:     x[3+idx] = s4;
1600:     x[4+idx] = s5;
1601:   }
1602:   /* backward solve the upper triangular */
1603:   for (i=n-1; i>=0; i--){
1604:     v    = aa + 25*diag[i] + 25;
1605:     vi   = aj + diag[i] + 1;
1606:     nz   = ai[i+1] - diag[i] - 1;
1607:     idt  = 5*i;
1608:     s1 = x[idt];  s2 = x[1+idt];
1609:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1610:     while (nz--) {
1611:       idx   = 5*(*vi++);
1612:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1613:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1614:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1615:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1616:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1617:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1618:       v    += 25;
1619:     }
1620:     v        = aa + 25*diag[i];
1621:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1622:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1623:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1624:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1625:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1626:   }

1628:   VecRestoreArray(bb,&b);
1629:   VecRestoreArray(xx,&x);
1630:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1631:   return(0);
1632: }

1634: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1635: {
1636:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1637:   IS              iscol=a->col,isrow=a->row;
1638:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1639:   int             *diag = a->diag;
1640:   MatScalar       *aa=a->a,*v;
1641:   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1644:   VecGetArray(bb,&b);
1645:   VecGetArray(xx,&x);
1646:   t  = a->solve_work;

1648:   ISGetIndices(isrow,&rout); r = rout;
1649:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1651:   /* forward solve the lower triangular */
1652:   idx    = 4*(*r++);
1653:   t[0] = b[idx];   t[1] = b[1+idx];
1654:   t[2] = b[2+idx]; t[3] = b[3+idx];
1655:   for (i=1; i<n; i++) {
1656:     v     = aa + 16*ai[i];
1657:     vi    = aj + ai[i];
1658:     nz    = diag[i] - ai[i];
1659:     idx   = 4*(*r++);
1660:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1661:     while (nz--) {
1662:       idx   = 4*(*vi++);
1663:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1664:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1665:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1666:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1667:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1668:       v    += 16;
1669:     }
1670:     idx        = 4*i;
1671:     t[idx]   = s1;t[1+idx] = s2;
1672:     t[2+idx] = s3;t[3+idx] = s4;
1673:   }
1674:   /* backward solve the upper triangular */
1675:   for (i=n-1; i>=0; i--){
1676:     v    = aa + 16*diag[i] + 16;
1677:     vi   = aj + diag[i] + 1;
1678:     nz   = ai[i+1] - diag[i] - 1;
1679:     idt  = 4*i;
1680:     s1 = t[idt];  s2 = t[1+idt];
1681:     s3 = t[2+idt];s4 = t[3+idt];
1682:     while (nz--) {
1683:       idx   = 4*(*vi++);
1684:       x1    = t[idx];   x2 = t[1+idx];
1685:       x3    = t[2+idx]; x4 = t[3+idx];
1686:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1687:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1688:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1689:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1690:       v += 16;
1691:     }
1692:     idc      = 4*(*c--);
1693:     v        = aa + 16*diag[i];
1694:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1695:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1696:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1697:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1698:   }

1700:   ISRestoreIndices(isrow,&rout);
1701:   ISRestoreIndices(iscol,&cout);
1702:   VecRestoreArray(bb,&b);
1703:   VecRestoreArray(xx,&x);
1704:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1705:   return(0);
1706: }
1707: #if defined (PETSC_HAVE_SSE)

1709: #include PETSC_HAVE_SSE

1711: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1712: {
1713:   /* 
1714:      Note: This code uses demotion of double
1715:      to float when performing the mixed-mode computation.
1716:      This may not be numerically reasonable for all applications.
1717:   */
1718:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1719:   IS              iscol=a->col,isrow=a->row;
1720:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1721:   int             *diag = a->diag,ai16;
1722:   MatScalar       *aa=a->a,*v;
1723:   PetscScalar     *x,*b,*t;

1725:   /* Make space in temp stack for 16 Byte Aligned arrays */
1726:   float           ssealignedspace[11],*tmps,*tmpx;
1727:   unsigned long   offset;
1728: 
1730:   SSE_SCOPE_BEGIN;

1732:     offset = (unsigned long)ssealignedspace % 16;
1733:     if (offset) offset = (16 - offset)/4;
1734:     tmps = &ssealignedspace[offset];
1735:     tmpx = &ssealignedspace[offset+4];
1736:     PREFETCH_NTA(aa+16*ai[1]);

1738:     VecGetArray(bb,&b);
1739:     VecGetArray(xx,&x);
1740:     t  = a->solve_work;

1742:     ISGetIndices(isrow,&rout); r = rout;
1743:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1745:     /* forward solve the lower triangular */
1746:     idx  = 4*(*r++);
1747:     t[0] = b[idx];   t[1] = b[1+idx];
1748:     t[2] = b[2+idx]; t[3] = b[3+idx];
1749:     v    =  aa + 16*ai[1];

1751:     for (i=1; i<n;) {
1752:       PREFETCH_NTA(&v[8]);
1753:       vi   =  aj      + ai[i];
1754:       nz   =  diag[i] - ai[i];
1755:       idx  =  4*(*r++);

1757:       /* Demote sum from double to float */
1758:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1759:       LOAD_PS(tmps,XMM7);

1761:       while (nz--) {
1762:         PREFETCH_NTA(&v[16]);
1763:         idx = 4*(*vi++);
1764: 
1765:         /* Demote solution (so far) from double to float */
1766:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1768:         /* 4x4 Matrix-Vector product with negative accumulation: */
1769:         SSE_INLINE_BEGIN_2(tmpx,v)
1770:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1772:           /* First Column */
1773:           SSE_COPY_PS(XMM0,XMM6)
1774:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1775:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1776:           SSE_SUB_PS(XMM7,XMM0)
1777: 
1778:           /* Second Column */
1779:           SSE_COPY_PS(XMM1,XMM6)
1780:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1781:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1782:           SSE_SUB_PS(XMM7,XMM1)
1783: 
1784:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1785: 
1786:           /* Third Column */
1787:           SSE_COPY_PS(XMM2,XMM6)
1788:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1789:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1790:           SSE_SUB_PS(XMM7,XMM2)

1792:           /* Fourth Column */
1793:           SSE_COPY_PS(XMM3,XMM6)
1794:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1795:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1796:           SSE_SUB_PS(XMM7,XMM3)
1797:         SSE_INLINE_END_2
1798: 
1799:         v  += 16;
1800:       }
1801:       idx = 4*i;
1802:       v   = aa + 16*ai[++i];
1803:       PREFETCH_NTA(v);
1804:       STORE_PS(tmps,XMM7);

1806:       /* Promote result from float to double */
1807:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1808:     }
1809:     /* backward solve the upper triangular */
1810:     idt  = 4*(n-1);
1811:     ai16 = 16*diag[n-1];
1812:     v    = aa + ai16 + 16;
1813:     for (i=n-1; i>=0;){
1814:       PREFETCH_NTA(&v[8]);
1815:       vi = aj + diag[i] + 1;
1816:       nz = ai[i+1] - diag[i] - 1;
1817: 
1818:       /* Demote accumulator from double to float */
1819:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1820:       LOAD_PS(tmps,XMM7);

1822:       while (nz--) {
1823:         PREFETCH_NTA(&v[16]);
1824:         idx = 4*(*vi++);

1826:         /* Demote solution (so far) from double to float */
1827:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1829:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1830:         SSE_INLINE_BEGIN_2(tmpx,v)
1831:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1833:           /* First Column */
1834:           SSE_COPY_PS(XMM0,XMM6)
1835:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1836:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1837:           SSE_SUB_PS(XMM7,XMM0)

1839:           /* Second Column */
1840:           SSE_COPY_PS(XMM1,XMM6)
1841:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1842:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1843:           SSE_SUB_PS(XMM7,XMM1)

1845:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1846: 
1847:           /* Third Column */
1848:           SSE_COPY_PS(XMM2,XMM6)
1849:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1850:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1851:           SSE_SUB_PS(XMM7,XMM2)

1853:           /* Fourth Column */
1854:           SSE_COPY_PS(XMM3,XMM6)
1855:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1856:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1857:           SSE_SUB_PS(XMM7,XMM3)
1858:         SSE_INLINE_END_2
1859:         v  += 16;
1860:       }
1861:       v    = aa + ai16;
1862:       ai16 = 16*diag[--i];
1863:       PREFETCH_NTA(aa+ai16+16);
1864:       /* 
1865:          Scale the result by the diagonal 4x4 block, 
1866:          which was inverted as part of the factorization
1867:       */
1868:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
1869:         /* First Column */
1870:         SSE_COPY_PS(XMM0,XMM7)
1871:         SSE_SHUFFLE(XMM0,XMM0,0x00)
1872:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

1874:         /* Second Column */
1875:         SSE_COPY_PS(XMM1,XMM7)
1876:         SSE_SHUFFLE(XMM1,XMM1,0x55)
1877:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
1878:         SSE_ADD_PS(XMM0,XMM1)

1880:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
1881: 
1882:         /* Third Column */
1883:         SSE_COPY_PS(XMM2,XMM7)
1884:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
1885:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
1886:         SSE_ADD_PS(XMM0,XMM2)

1888:         /* Fourth Column */
1889:         SSE_COPY_PS(XMM3,XMM7)
1890:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
1891:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
1892:         SSE_ADD_PS(XMM0,XMM3)
1893: 
1894:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
1895:       SSE_INLINE_END_3

1897:       /* Promote solution from float to double */
1898:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

1900:       /* Apply reordering to t and stream into x.    */
1901:       /* This way, x doesn't pollute the cache.      */
1902:       /* Be careful with size: 2 doubles = 4 floats! */
1903:       idc  = 4*(*c--);
1904:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
1905:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1906:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
1907:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
1908:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1909:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
1910:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
1911:       SSE_INLINE_END_2
1912:       v    = aa + ai16 + 16;
1913:       idt -= 4;
1914:     }

1916:     ISRestoreIndices(isrow,&rout);
1917:     ISRestoreIndices(iscol,&cout);
1918:     VecRestoreArray(bb,&b);
1919:     VecRestoreArray(xx,&x);
1920:     PetscLogFlops(2*16*(a->nz) - 4*A->n);
1921:   SSE_SCOPE_END;
1922:   return(0);
1923: }

1925: #endif


1928: /*
1929:       Special case where the matrix was ILU(0) factored in the natural
1930:    ordering. This eliminates the need for the column and row permutation.
1931: */
1932: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1933: {
1934:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1935:   int             n=a->mbs,*ai=a->i,*aj=a->j;
1936:   int             ierr,*diag = a->diag;
1937:   MatScalar       *aa=a->a;
1938:   PetscScalar     *x,*b;

1941:   VecGetArray(bb,&b);
1942:   VecGetArray(xx,&x);

1944: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1945:   {
1946:     static PetscScalar w[2000]; /* very BAD need to fix */
1947:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1948:   }
1949: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1950:   {
1951:     static PetscScalar w[2000]; /* very BAD need to fix */
1952:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1953:   }
1954: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1955:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1956: #else
1957:   {
1958:     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
1959:     MatScalar    *v;
1960:     int          jdx,idt,idx,nz,*vi,i,ai16;

1962:   /* forward solve the lower triangular */
1963:   idx    = 0;
1964:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1965:   for (i=1; i<n; i++) {
1966:     v     =  aa      + 16*ai[i];
1967:     vi    =  aj      + ai[i];
1968:     nz    =  diag[i] - ai[i];
1969:     idx   +=  4;
1970:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1971:     while (nz--) {
1972:       jdx   = 4*(*vi++);
1973:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1974:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1975:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1976:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1977:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1978:       v    += 16;
1979:     }
1980:     x[idx]   = s1;
1981:     x[1+idx] = s2;
1982:     x[2+idx] = s3;
1983:     x[3+idx] = s4;
1984:   }
1985:   /* backward solve the upper triangular */
1986:   idt = 4*(n-1);
1987:   for (i=n-1; i>=0; i--){
1988:     ai16 = 16*diag[i];
1989:     v    = aa + ai16 + 16;
1990:     vi   = aj + diag[i] + 1;
1991:     nz   = ai[i+1] - diag[i] - 1;
1992:     s1 = x[idt];  s2 = x[1+idt];
1993:     s3 = x[2+idt];s4 = x[3+idt];
1994:     while (nz--) {
1995:       idx   = 4*(*vi++);
1996:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1997:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1998:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1999:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2000:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2001:       v    += 16;
2002:     }
2003:     v        = aa + ai16;
2004:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2005:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2006:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2007:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2008:     idt -= 4;
2009:   }
2010:   }
2011: #endif

2013:   VecRestoreArray(bb,&b);
2014:   VecRestoreArray(xx,&x);
2015:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2016:   return(0);
2017: }

2019: #if defined (PETSC_HAVE_SSE)

2021: #include PETSC_HAVE_SSE

2023: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2024: {
2025:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2026:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2027:   int             ierr,*diag = a->diag;
2028:   MatScalar       *aa=a->a;
2029:   PetscScalar     *x,*b;

2032:   SSE_SCOPE_BEGIN;
2033:   /* 
2034:      Note: This code currently uses demotion of double
2035:      to float when performing the mixed-mode computation.
2036:      This may not be numerically reasonable for all applications.
2037:   */
2038:   PREFETCH_NTA(aa+16*ai[1]);

2040:   VecGetArray(bb,&b);
2041:   VecGetArray(xx,&x);
2042:   {
2043:     MatScalar     *v;
2044:     int           jdx,idt,idx,nz,*vi,i,ai16;

2046:     /* Make space in temp stack for 16 Byte Aligned arrays */
2047:     float         ssealignedspace[11],*tmps,*tmpx;
2048:     unsigned long offset = (unsigned long)ssealignedspace % 16;
2049:     if (offset) offset = (16 - offset)/4;
2050:     tmps = &ssealignedspace[offset];
2051:     tmpx = &ssealignedspace[offset+4];

2053:   /* forward solve the lower triangular */
2054:     idx  = 0;
2055:     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2056:     v    =  aa + 16*ai[1];

2058:     for (i=1; i<n;) {
2059:       PREFETCH_NTA(&v[8]);
2060:       vi   =  aj      + ai[i];
2061:       nz   =  diag[i] - ai[i];
2062:       idx +=  4;

2064:     /* Demote sum from double to float */
2065:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2066:       LOAD_PS(tmps,XMM7);

2068:       while (nz--) {
2069:         PREFETCH_NTA(&v[16]);
2070:         jdx = 4*(*vi++);
2071: 
2072:         /* Demote solution (so far) from double to float */
2073:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);

2075:         /* 4x4 Matrix-Vector product with negative accumulation: */
2076:         SSE_INLINE_BEGIN_2(tmpx,v)
2077:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2079:           /* First Column */
2080:           SSE_COPY_PS(XMM0,XMM6)
2081:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2082:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2083:           SSE_SUB_PS(XMM7,XMM0)

2085:           /* Second Column */
2086:           SSE_COPY_PS(XMM1,XMM6)
2087:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2088:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2089:           SSE_SUB_PS(XMM7,XMM1)

2091:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2092: 
2093:           /* Third Column */
2094:           SSE_COPY_PS(XMM2,XMM6)
2095:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2096:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2097:           SSE_SUB_PS(XMM7,XMM2)

2099:           /* Fourth Column */
2100:           SSE_COPY_PS(XMM3,XMM6)
2101:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2102:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2103:           SSE_SUB_PS(XMM7,XMM3)
2104:         SSE_INLINE_END_2
2105: 
2106:         v  += 16;
2107:       }
2108:       v    =  aa + 16*ai[++i];
2109:       PREFETCH_NTA(v);
2110:       STORE_PS(tmps,XMM7);

2112:       /* Promote result from float to double */
2113:       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
2114:     }
2115:     /* backward solve the upper triangular */
2116:     idt  = 4*(n-1);
2117:     ai16 = 16*diag[n-1];
2118:     v    = aa + ai16 + 16;
2119:     for (i=n-1; i>=0;){
2120:       PREFETCH_NTA(&v[8]);
2121:       vi = aj + diag[i] + 1;
2122:       nz = ai[i+1] - diag[i] - 1;
2123: 
2124:       /* Demote accumulator from double to float */
2125:       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
2126:       LOAD_PS(tmps,XMM7);

2128:       while (nz--) {
2129:         PREFETCH_NTA(&v[16]);
2130:         idx = 4*(*vi++);

2132:         /* Demote solution (so far) from double to float */
2133:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

2135:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2136:         SSE_INLINE_BEGIN_2(tmpx,v)
2137:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2139:           /* First Column */
2140:           SSE_COPY_PS(XMM0,XMM6)
2141:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2142:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2143:           SSE_SUB_PS(XMM7,XMM0)

2145:           /* Second Column */
2146:           SSE_COPY_PS(XMM1,XMM6)
2147:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2148:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2149:           SSE_SUB_PS(XMM7,XMM1)

2151:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2152: 
2153:           /* Third Column */
2154:           SSE_COPY_PS(XMM2,XMM6)
2155:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2156:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2157:           SSE_SUB_PS(XMM7,XMM2)

2159:           /* Fourth Column */
2160:           SSE_COPY_PS(XMM3,XMM6)
2161:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2162:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2163:           SSE_SUB_PS(XMM7,XMM3)
2164:         SSE_INLINE_END_2
2165:         v  += 16;
2166:       }
2167:       v    = aa + ai16;
2168:       ai16 = 16*diag[--i];
2169:       PREFETCH_NTA(aa+ai16+16);
2170:       /* 
2171:          Scale the result by the diagonal 4x4 block, 
2172:          which was inverted as part of the factorization
2173:       */
2174:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2175:         /* First Column */
2176:         SSE_COPY_PS(XMM0,XMM7)
2177:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2178:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2180:         /* Second Column */
2181:         SSE_COPY_PS(XMM1,XMM7)
2182:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2183:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2184:         SSE_ADD_PS(XMM0,XMM1)

2186:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2187: 
2188:         /* Third Column */
2189:         SSE_COPY_PS(XMM2,XMM7)
2190:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2191:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2192:         SSE_ADD_PS(XMM0,XMM2)

2194:         /* Fourth Column */
2195:         SSE_COPY_PS(XMM3,XMM7)
2196:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2197:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2198:         SSE_ADD_PS(XMM0,XMM3)

2200:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2201:       SSE_INLINE_END_3

2203:       /* Promote solution from float to double */
2204:       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);

2206:       v    = aa + ai16 + 16;
2207:       idt -= 4;
2208:     }
2209:   }
2210:   VecRestoreArray(bb,&b);
2211:   VecRestoreArray(xx,&x);
2212:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2213:   SSE_SCOPE_END;
2214:   return(0);
2215: }

2217: #endif
2218: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2219: {
2220:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2221:   IS              iscol=a->col,isrow=a->row;
2222:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2223:   int             *diag = a->diag;
2224:   MatScalar       *aa=a->a,*v;
2225:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;

2228:   VecGetArray(bb,&b);
2229:   VecGetArray(xx,&x);
2230:   t  = a->solve_work;

2232:   ISGetIndices(isrow,&rout); r = rout;
2233:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2235:   /* forward solve the lower triangular */
2236:   idx    = 3*(*r++);
2237:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2238:   for (i=1; i<n; i++) {
2239:     v     = aa + 9*ai[i];
2240:     vi    = aj + ai[i];
2241:     nz    = diag[i] - ai[i];
2242:     idx   = 3*(*r++);
2243:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2244:     while (nz--) {
2245:       idx   = 3*(*vi++);
2246:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2247:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2248:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2249:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2250:       v += 9;
2251:     }
2252:     idx = 3*i;
2253:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2254:   }
2255:   /* backward solve the upper triangular */
2256:   for (i=n-1; i>=0; i--){
2257:     v    = aa + 9*diag[i] + 9;
2258:     vi   = aj + diag[i] + 1;
2259:     nz   = ai[i+1] - diag[i] - 1;
2260:     idt  = 3*i;
2261:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2262:     while (nz--) {
2263:       idx   = 3*(*vi++);
2264:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2265:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2266:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2267:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2268:       v += 9;
2269:     }
2270:     idc = 3*(*c--);
2271:     v   = aa + 9*diag[i];
2272:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2273:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2274:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2275:   }
2276:   ISRestoreIndices(isrow,&rout);
2277:   ISRestoreIndices(iscol,&cout);
2278:   VecRestoreArray(bb,&b);
2279:   VecRestoreArray(xx,&x);
2280:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2281:   return(0);
2282: }

2284: /*
2285:       Special case where the matrix was ILU(0) factored in the natural
2286:    ordering. This eliminates the need for the column and row permutation.
2287: */
2288: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2289: {
2290:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2291:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2292:   int             ierr,*diag = a->diag;
2293:   MatScalar       *aa=a->a,*v;
2294:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2295:   int             jdx,idt,idx,nz,*vi,i;

2298:   VecGetArray(bb,&b);
2299:   VecGetArray(xx,&x);


2302:   /* forward solve the lower triangular */
2303:   idx    = 0;
2304:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2305:   for (i=1; i<n; i++) {
2306:     v     =  aa      + 9*ai[i];
2307:     vi    =  aj      + ai[i];
2308:     nz    =  diag[i] - ai[i];
2309:     idx   +=  3;
2310:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2311:     while (nz--) {
2312:       jdx   = 3*(*vi++);
2313:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2314:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2315:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2316:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2317:       v    += 9;
2318:     }
2319:     x[idx]   = s1;
2320:     x[1+idx] = s2;
2321:     x[2+idx] = s3;
2322:   }
2323:   /* backward solve the upper triangular */
2324:   for (i=n-1; i>=0; i--){
2325:     v    = aa + 9*diag[i] + 9;
2326:     vi   = aj + diag[i] + 1;
2327:     nz   = ai[i+1] - diag[i] - 1;
2328:     idt  = 3*i;
2329:     s1 = x[idt];  s2 = x[1+idt];
2330:     s3 = x[2+idt];
2331:     while (nz--) {
2332:       idx   = 3*(*vi++);
2333:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2334:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2335:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2336:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2337:       v    += 9;
2338:     }
2339:     v        = aa +  9*diag[i];
2340:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2341:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2342:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2343:   }

2345:   VecRestoreArray(bb,&b);
2346:   VecRestoreArray(xx,&x);
2347:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2348:   return(0);
2349: }

2351: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2352: {
2353:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2354:   IS              iscol=a->col,isrow=a->row;
2355:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2356:   int             *diag = a->diag;
2357:   MatScalar       *aa=a->a,*v;
2358:   PetscScalar     *x,*b,s1,s2,x1,x2,*t;

2361:   VecGetArray(bb,&b);
2362:   VecGetArray(xx,&x);
2363:   t  = a->solve_work;

2365:   ISGetIndices(isrow,&rout); r = rout;
2366:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2368:   /* forward solve the lower triangular */
2369:   idx    = 2*(*r++);
2370:   t[0] = b[idx]; t[1] = b[1+idx];
2371:   for (i=1; i<n; i++) {
2372:     v     = aa + 4*ai[i];
2373:     vi    = aj + ai[i];
2374:     nz    = diag[i] - ai[i];
2375:     idx   = 2*(*r++);
2376:     s1  = b[idx]; s2 = b[1+idx];
2377:     while (nz--) {
2378:       idx   = 2*(*vi++);
2379:       x1    = t[idx]; x2 = t[1+idx];
2380:       s1 -= v[0]*x1 + v[2]*x2;
2381:       s2 -= v[1]*x1 + v[3]*x2;
2382:       v += 4;
2383:     }
2384:     idx = 2*i;
2385:     t[idx] = s1; t[1+idx] = s2;
2386:   }
2387:   /* backward solve the upper triangular */
2388:   for (i=n-1; i>=0; i--){
2389:     v    = aa + 4*diag[i] + 4;
2390:     vi   = aj + diag[i] + 1;
2391:     nz   = ai[i+1] - diag[i] - 1;
2392:     idt  = 2*i;
2393:     s1 = t[idt]; s2 = t[1+idt];
2394:     while (nz--) {
2395:       idx   = 2*(*vi++);
2396:       x1    = t[idx]; x2 = t[1+idx];
2397:       s1 -= v[0]*x1 + v[2]*x2;
2398:       s2 -= v[1]*x1 + v[3]*x2;
2399:       v += 4;
2400:     }
2401:     idc = 2*(*c--);
2402:     v   = aa + 4*diag[i];
2403:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2404:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2405:   }
2406:   ISRestoreIndices(isrow,&rout);
2407:   ISRestoreIndices(iscol,&cout);
2408:   VecRestoreArray(bb,&b);
2409:   VecRestoreArray(xx,&x);
2410:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2411:   return(0);
2412: }

2414: /*
2415:       Special case where the matrix was ILU(0) factored in the natural
2416:    ordering. This eliminates the need for the column and row permutation.
2417: */
2418: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2419: {
2420:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2421:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2422:   int             ierr,*diag = a->diag;
2423:   MatScalar       *aa=a->a,*v;
2424:   PetscScalar     *x,*b,s1,s2,x1,x2;
2425:   int             jdx,idt,idx,nz,*vi,i;

2428:   VecGetArray(bb,&b);
2429:   VecGetArray(xx,&x);

2431:   /* forward solve the lower triangular */
2432:   idx    = 0;
2433:   x[0]   = b[0]; x[1] = b[1];
2434:   for (i=1; i<n; i++) {
2435:     v     =  aa      + 4*ai[i];
2436:     vi    =  aj      + ai[i];
2437:     nz    =  diag[i] - ai[i];
2438:     idx   +=  2;
2439:     s1  =  b[idx];s2 = b[1+idx];
2440:     while (nz--) {
2441:       jdx   = 2*(*vi++);
2442:       x1    = x[jdx];x2 = x[1+jdx];
2443:       s1 -= v[0]*x1 + v[2]*x2;
2444:       s2 -= v[1]*x1 + v[3]*x2;
2445:       v    += 4;
2446:     }
2447:     x[idx]   = s1;
2448:     x[1+idx] = s2;
2449:   }
2450:   /* backward solve the upper triangular */
2451:   for (i=n-1; i>=0; i--){
2452:     v    = aa + 4*diag[i] + 4;
2453:     vi   = aj + diag[i] + 1;
2454:     nz   = ai[i+1] - diag[i] - 1;
2455:     idt  = 2*i;
2456:     s1 = x[idt];  s2 = x[1+idt];
2457:     while (nz--) {
2458:       idx   = 2*(*vi++);
2459:       x1    = x[idx];   x2 = x[1+idx];
2460:       s1 -= v[0]*x1 + v[2]*x2;
2461:       s2 -= v[1]*x1 + v[3]*x2;
2462:       v    += 4;
2463:     }
2464:     v        = aa +  4*diag[i];
2465:     x[idt]   = v[0]*s1 + v[2]*s2;
2466:     x[1+idt] = v[1]*s1 + v[3]*s2;
2467:   }

2469:   VecRestoreArray(bb,&b);
2470:   VecRestoreArray(xx,&x);
2471:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2472:   return(0);
2473: }

2475: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2476: {
2477:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2478:   IS              iscol=a->col,isrow=a->row;
2479:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2480:   int             *diag = a->diag;
2481:   MatScalar       *aa=a->a,*v;
2482:   PetscScalar     *x,*b,s1,*t;

2485:   if (!n) return(0);

2487:   VecGetArray(bb,&b);
2488:   VecGetArray(xx,&x);
2489:   t  = a->solve_work;

2491:   ISGetIndices(isrow,&rout); r = rout;
2492:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2494:   /* forward solve the lower triangular */
2495:   t[0] = b[*r++];
2496:   for (i=1; i<n; i++) {
2497:     v     = aa + ai[i];
2498:     vi    = aj + ai[i];
2499:     nz    = diag[i] - ai[i];
2500:     s1  = b[*r++];
2501:     while (nz--) {
2502:       s1 -= (*v++)*t[*vi++];
2503:     }
2504:     t[i] = s1;
2505:   }
2506:   /* backward solve the upper triangular */
2507:   for (i=n-1; i>=0; i--){
2508:     v    = aa + diag[i] + 1;
2509:     vi   = aj + diag[i] + 1;
2510:     nz   = ai[i+1] - diag[i] - 1;
2511:     s1 = t[i];
2512:     while (nz--) {
2513:       s1 -= (*v++)*t[*vi++];
2514:     }
2515:     x[*c--] = t[i] = aa[diag[i]]*s1;
2516:   }

2518:   ISRestoreIndices(isrow,&rout);
2519:   ISRestoreIndices(iscol,&cout);
2520:   VecRestoreArray(bb,&b);
2521:   VecRestoreArray(xx,&x);
2522:   PetscLogFlops(2*1*(a->nz) - A->n);
2523:   return(0);
2524: }
2525: /*
2526:       Special case where the matrix was ILU(0) factored in the natural
2527:    ordering. This eliminates the need for the column and row permutation.
2528: */
2529: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2530: {
2531:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2532:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2533:   int             ierr,*diag = a->diag;
2534:   MatScalar       *aa=a->a;
2535:   PetscScalar     *x,*b;
2536:   PetscScalar     s1,x1;
2537:   MatScalar       *v;
2538:   int             jdx,idt,idx,nz,*vi,i;

2541:   VecGetArray(bb,&b);
2542:   VecGetArray(xx,&x);

2544:   /* forward solve the lower triangular */
2545:   idx    = 0;
2546:   x[0]   = b[0];
2547:   for (i=1; i<n; i++) {
2548:     v     =  aa      + ai[i];
2549:     vi    =  aj      + ai[i];
2550:     nz    =  diag[i] - ai[i];
2551:     idx   +=  1;
2552:     s1  =  b[idx];
2553:     while (nz--) {
2554:       jdx   = *vi++;
2555:       x1    = x[jdx];
2556:       s1 -= v[0]*x1;
2557:       v    += 1;
2558:     }
2559:     x[idx]   = s1;
2560:   }
2561:   /* backward solve the upper triangular */
2562:   for (i=n-1; i>=0; i--){
2563:     v    = aa + diag[i] + 1;
2564:     vi   = aj + diag[i] + 1;
2565:     nz   = ai[i+1] - diag[i] - 1;
2566:     idt  = i;
2567:     s1 = x[idt];
2568:     while (nz--) {
2569:       idx   = *vi++;
2570:       x1    = x[idx];
2571:       s1 -= v[0]*x1;
2572:       v    += 1;
2573:     }
2574:     v        = aa +  diag[i];
2575:     x[idt]   = v[0]*s1;
2576:   }
2577:   VecRestoreArray(bb,&b);
2578:   VecRestoreArray(xx,&x);
2579:   PetscLogFlops(2*(a->nz) - A->n);
2580:   return(0);
2581: }

2583: /* ----------------------------------------------------------------*/
2584: /*
2585:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2586:    except that the data structure of Mat_SeqAIJ is slightly different.
2587:    Not a good example of code reuse.
2588: */
2589: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);

2591: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2592: {
2593:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2594:   IS          isicol;
2595:   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2596:   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2597:   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2598:   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2599:   PetscTruth  col_identity,row_identity;
2600:   PetscReal   f;

2603:   if (info) {
2604:     f             = info->fill;
2605:     levels        = (int)info->levels;
2606:     diagonal_fill = (int)info->diagonal_fill;
2607:   } else {
2608:     f             = 1.0;
2609:     levels        = 0;
2610:     diagonal_fill = 0;
2611:   }
2612:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
2613:   ISIdentity(isrow,&row_identity);
2614:   ISIdentity(iscol,&col_identity);

2616:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2617:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
2618:     (*fact)->factor = FACTOR_LU;
2619:     b               = (Mat_SeqBAIJ*)(*fact)->data;
2620:     if (!b->diag) {
2621:       MatMarkDiagonal_SeqBAIJ(*fact);
2622:     }
2623:     MatMissingDiagonal_SeqBAIJ(*fact);
2624:     b->row        = isrow;
2625:     b->col        = iscol;
2626:     ierr          = PetscObjectReference((PetscObject)isrow);
2627:     ierr          = PetscObjectReference((PetscObject)iscol);
2628:     b->icol       = isicol;
2629:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
2630:     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
2631:   } else { /* general case perform the symbolic factorization */
2632:     ISGetIndices(isrow,&r);
2633:     ISGetIndices(isicol,&ic);

2635:     /* get new row pointers */
2636:     PetscMalloc((n+1)*sizeof(int),&ainew);
2637:     ainew[0] = 0;
2638:     /* don't know how many column pointers are needed so estimate */
2639:     jmax = (int)(f*ai[n] + 1);
2640:     PetscMalloc((jmax)*sizeof(int),&ajnew);
2641:     /* ajfill is level of fill for each fill entry */
2642:     PetscMalloc((jmax)*sizeof(int),&ajfill);
2643:     /* fill is a linked list of nonzeros in active row */
2644:     PetscMalloc((n+1)*sizeof(int),&fill);
2645:     /* im is level for each filled value */
2646:     PetscMalloc((n+1)*sizeof(int),&im);
2647:     /* dloc is location of diagonal in factor */
2648:     PetscMalloc((n+1)*sizeof(int),&dloc);
2649:     dloc[0]  = 0;
2650:     for (prow=0; prow<n; prow++) {

2652:       /* copy prow into linked list */
2653:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2654:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2655:       xi         = aj + ai[r[prow]];
2656:       fill[n]    = n;
2657:       fill[prow] = -1; /* marker for diagonal entry */
2658:       while (nz--) {
2659:         fm  = n;
2660:         idx = ic[*xi++];
2661:         do {
2662:           m  = fm;
2663:           fm = fill[m];
2664:         } while (fm < idx);
2665:         fill[m]   = idx;
2666:         fill[idx] = fm;
2667:         im[idx]   = 0;
2668:       }

2670:       /* make sure diagonal entry is included */
2671:       if (diagonal_fill && fill[prow] == -1) {
2672:         fm = n;
2673:         while (fill[fm] < prow) fm = fill[fm];
2674:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
2675:         fill[fm]   = prow;
2676:         im[prow]   = 0;
2677:         nzf++;
2678:         dcount++;
2679:       }

2681:       nzi = 0;
2682:       row = fill[n];
2683:       while (row < prow) {
2684:         incrlev = im[row] + 1;
2685:         nz      = dloc[row];
2686:         xi      = ajnew  + ainew[row] + nz + 1;
2687:         flev    = ajfill + ainew[row] + nz + 1;
2688:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
2689:         fm      = row;
2690:         while (nnz-- > 0) {
2691:           idx = *xi++;
2692:           if (*flev + incrlev > levels) {
2693:             flev++;
2694:             continue;
2695:           }
2696:           do {
2697:             m  = fm;
2698:             fm = fill[m];
2699:           } while (fm < idx);
2700:           if (fm != idx) {
2701:             im[idx]   = *flev + incrlev;
2702:             fill[m]   = idx;
2703:             fill[idx] = fm;
2704:             fm        = idx;
2705:             nzf++;
2706:           } else {
2707:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2708:           }
2709:           flev++;
2710:         }
2711:         row = fill[row];
2712:         nzi++;
2713:       }
2714:       /* copy new filled row into permanent storage */
2715:       ainew[prow+1] = ainew[prow] + nzf;
2716:       if (ainew[prow+1] > jmax) {

2718:         /* estimate how much additional space we will need */
2719:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2720:         /* just double the memory each time */
2721:         int maxadd = jmax;
2722:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2723:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2724:         jmax += maxadd;

2726:         /* allocate a longer ajnew and ajfill */
2727:         PetscMalloc(jmax*sizeof(int),&xi);
2728:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
2729:         PetscFree(ajnew);
2730:         ajnew = xi;
2731:         PetscMalloc(jmax*sizeof(int),&xi);
2732:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
2733:         PetscFree(ajfill);
2734:         ajfill = xi;
2735:         realloc++; /* count how many reallocations are needed */
2736:       }
2737:       xi          = ajnew + ainew[prow];
2738:       flev        = ajfill + ainew[prow];
2739:       dloc[prow]  = nzi;
2740:       fm          = fill[n];
2741:       while (nzf--) {
2742:         *xi++   = fm;
2743:         *flev++ = im[fm];
2744:         fm      = fill[fm];
2745:       }
2746:       /* make sure row has diagonal entry */
2747:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2748:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrixn
2749:     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2750:       }
2751:     }
2752:     PetscFree(ajfill);
2753:     ISRestoreIndices(isrow,&r);
2754:     ISRestoreIndices(isicol,&ic);
2755:     PetscFree(fill);
2756:     PetscFree(im);

2758:     {
2759:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2760:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
2761:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use n",af);
2762:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);n",af);
2763:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.n");
2764:       if (diagonal_fill) {
2765:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
2766:       }
2767:     }

2769:     /* put together the new matrix */
2770:     MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);
2771:     PetscLogObjectParent(*fact,isicol);
2772:     b = (Mat_SeqBAIJ*)(*fact)->data;
2773:     PetscFree(b->imax);
2774:     b->singlemalloc = PETSC_FALSE;
2775:     len = bs2*ainew[n]*sizeof(MatScalar);
2776:     /* the next line frees the default space generated by the Create() */
2777:     PetscFree(b->a);
2778:     PetscFree(b->ilen);
2779:     PetscMalloc(len,&b->a);
2780:     b->j          = ajnew;
2781:     b->i          = ainew;
2782:     for (i=0; i<n; i++) dloc[i] += ainew[i];
2783:     b->diag       = dloc;
2784:     b->ilen       = 0;
2785:     b->imax       = 0;
2786:     b->row        = isrow;
2787:     b->col        = iscol;
2788:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
2789:     ierr          = PetscObjectReference((PetscObject)isrow);
2790:     ierr          = PetscObjectReference((PetscObject)iscol);
2791:     b->icol       = isicol;
2792:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
2793:     /* In b structure:  Free imax, ilen, old a, old j.  
2794:        Allocate dloc, solve_work, new a, new j */
2795:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
2796:     b->maxnz          = b->nz = ainew[n];
2797:     (*fact)->factor   = FACTOR_LU;

2799:     (*fact)->info.factor_mallocs    = realloc;
2800:     (*fact)->info.fill_ratio_given  = f;
2801:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2802:   }

2804:   if (row_identity && col_identity) {
2805:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
2806:   }
2807:   return(0);
2808: }

2810: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
2811: {
2812:   /*
2813:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
2814:       with natural ordering
2815:   */
2816:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2817:   PetscTruth  sse_enabled_local,sse_enabled_global;
2818:   int         ierr;

2821:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
2822:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
2823:   switch (a->bs) {
2824:   case 1:
2825:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2826:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1n");
2827:     break;
2828:   case 2:
2829:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2830:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2n");
2831:     break;
2832:   case 3:
2833:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2834:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3n");
2835:     break;
2836:   case 4:
2837:     PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);
2838:     if (sse_enabled_local) {
2839: #if defined(PETSC_HAVE_SSE)
2840:       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2841:       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4n");
2842: #else
2843:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2844:       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2845: #endif
2846:     } else {
2847:       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2848:       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4n");
2849:     }
2850:     break;
2851:   case 5:
2852:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2853:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5n");
2854:     break;
2855:   case 6:
2856:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2857:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6n");
2858:     break;
2859:   case 7:
2860:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2861:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7n");
2862:     break;
2863:   }
2864:   return(0);
2865: }

2867: int MatSeqBAIJ_UpdateSolvers(Mat A)
2868: {
2869:   /*
2870:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
2871:       with natural ordering
2872:   */
2873:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2874:   IS          row = a->row, col = a->col;
2875:   PetscTruth  row_identity, col_identity;
2876:   PetscTruth  sse_enabled_local, sse_enabled_global;
2877:   PetscTruth  use_single, use_natural;
2878:   int         ierr;


2882:   use_single  = PETSC_FALSE;
2883:   use_natural = PETSC_FALSE;

2885:   ISIdentity(row,&row_identity);
2886:   ISIdentity(col,&col_identity);

2888:   if (row_identity && col_identity) {
2889:     use_natural = PETSC_TRUE;
2890:   } else {
2891:     use_natural = PETSC_FALSE;
2892:   }
2893:   switch (a->bs) {
2894:   case 1:
2895:     if (use_natural) {
2896:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
2897:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
2898:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1n");
2899:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2900:     } else {
2901:       A->ops->solve           = MatSolve_SeqBAIJ_1;
2902:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
2903:     }
2904:     break;
2905:   case 2:
2906:     if (use_natural) {
2907:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2908:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
2909:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2n");
2910:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2911:     } else {
2912:       A->ops->solve           = MatSolve_SeqBAIJ_2;
2913:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
2914:     }
2915:     break;
2916:   case 3:
2917:     if (use_natural) {
2918:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2919:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
2920:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3n");
2921:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2922:     } else {
2923:       A->ops->solve           = MatSolve_SeqBAIJ_3;
2924:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
2925:     }
2926:     break;
2927:   case 4:
2928:     PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);
2929:     if (sse_enabled_local) {
2930:       PetscTruth single_prec,flg;
2931:       single_prec = flg = PETSC_FALSE;
2932:       PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);
2933:       if (flg) {
2934:         a->single_precision_solves = single_prec;
2935:       }
2936:       if (a->single_precision_solves) {
2937:         use_single = PETSC_TRUE;
2938:       }
2939:     } else {
2940:       use_single = PETSC_FALSE;
2941:     }
2942:     if (use_natural) {
2943:       if (use_single) {
2944: #if defined(PETSC_HAVE_SSE)
2945:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
2946:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4n");
2947: #else
2948:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2949:       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2950: #endif
2951:       } else {
2952:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
2953:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2954:       }
2955:       A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
2956:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
2957:     } else {
2958:       if (use_single) {
2959: #if defined(PETSC_HAVE_SSE)
2960:         A->ops->solve           = MatSolve_SeqBAIJ_4_SSE_Demotion;
2961:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4n");
2962: #else
2963:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2964:       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2965: #endif
2966:       } else {
2967:         A->ops->solve           = MatSolve_SeqBAIJ_4;
2968:       }
2969:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
2970:     }
2971:     break;
2972:   case 5:
2973:     if (use_natural) {
2974:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
2975:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
2976:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5n");
2977:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5n");
2978:     } else {
2979:       A->ops->solve           = MatSolve_SeqBAIJ_5;
2980:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
2981:     }
2982:     break;
2983:   case 6:
2984:     if (use_natural) {
2985:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
2986:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
2987:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6n");
2988:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6n");
2989:     } else {
2990:       A->ops->solve           = MatSolve_SeqBAIJ_6;
2991:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
2992:     }
2993:     break;
2994:   case 7:
2995:     if (use_natural) {
2996:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
2997:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
2998:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7n");
2999:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7n");
3000:     } else {
3001:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3002:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3003:     }
3004:     break;
3005:   default:
3006:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3007:     break;
3008:   }
3009:   return(0);
3010: }

3012: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3016:   MatSeqBAIJ_UpdateSolvers(A);
3017:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3018:     (*A->ops->solve)(A,x,y);
3019:   } else {
3020:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3021:   }
3022:   return(0);
3023: }

3025: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3029:   MatSeqBAIJ_UpdateSolvers(A);
3030:   (*A->ops->solvetranspose)(A,x,y);
3031:   return(0);
3032: }