Actual source code: sbaijfact2.c

  1: /*$Id: sbaijfact2.c,v 1.2.1.41 2001/08/07 03:03:01 balay Exp $*/
  2: /*
  3:     Factorization code for SBAIJ format. 
  4: */

 6:  #include src/mat/impls/sbaij/seq/sbaij.h
 7:  #include src/mat/impls/baij/seq/baij.h
 8:  #include src/vec/vecimpl.h
 9:  #include src/inline/ilu.h
 10:  #include src/inline/dot.h

 12: int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 13: {
 15:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 16: }

 18: int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 19: {
 21:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 22: }

 24: int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
 25: {
 27:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 28: }

 30: int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
 31: {
 33:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 34: }

 36: int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
 37: {
 39:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 40: }

 42: int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
 43: {
 45:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 46: }

 48: int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
 49: {
 51:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 52: }

 54: int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
 55: {
 57:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 58: }

 60: int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
 61: {
 63:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 64: }

 66: int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
 67: {
 69:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 70: }

 72: int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
 73: {
 75:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 76: }

 78: int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
 79: {
 81:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 82: }

 84: int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
 85: {
 87:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 88: }

 90: int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
 91: {
 93:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 94: }

 96: int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
 97: {
 98:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
 99:   IS              isrow=a->row;
100:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
101:   int             nz,*vj,k,ierr,*r,idx,k1;
102:   int             bs=a->bs,bs2 = a->bs2;
103:   MatScalar       *aa=a->a,*v,*diag;
104:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

107:   VecGetArray(bb,&b);
108:   VecGetArray(xx,&x);
109:   t  = a->solve_work;
110:   ISGetIndices(isrow,&r);
111:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

113:   /* solve U^T * D * y = b by forward substitution */
114:   xk = t;
115:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
116:     idx   = bs*r[k];
117:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
118:   }
119:   for (k=0; k<mbs; k++){
120:     v  = aa + bs2*ai[k];
121:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
122:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
123:     nz = ai[k+1] - ai[k];
124:     vj = aj + ai[k];
125:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
126:     while (nz--) {
127:       /* x(:) += U(k,:)^T*(Dk*xk) */
128:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
129:       vj++; xj = t + (*vj)*bs;
130:       v += bs2;
131:     }
132:     /* xk = inv(Dk)*(Dk*xk) */
133:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
134:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
135:   }

137:   /* solve U*x = y by back substitution */
138:   for (k=mbs-1; k>=0; k--){
139:     v  = aa + bs2*ai[k];
140:     xk = t + k*bs;        /* xk */
141:     nz = ai[k+1] - ai[k];
142:     vj = aj + ai[k];
143:     xj = t + (*vj)*bs;
144:     while (nz--) {
145:       /* xk += U(k,:)*x(:) */
146:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
147:       vj++;
148:       v += bs2; xj = t + (*vj)*bs;
149:     }
150:     idx   = bs*r[k];
151:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
152:   }

154:   PetscFree(xk_tmp);
155:   VecRestoreArray(bb,&b);
156:   VecRestoreArray(xx,&x);
157:   PetscLogFlops(bs2*(2*a->s_nz + mbs));
158:   return(0);
159: }

161: int MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
162: {
163:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
164:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
165:   int             nz,*vj,k,ierr;
166:   int             bs=a->bs,bs2 = a->bs2;
167:   MatScalar       *aa=a->a,*v,*diag;
168:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp;

171: 
172:   VecGetArray(bb,&b);
173:   VecGetArray(xx,&x);

175:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

177:   /* solve U^T * D * y = b by forward substitution */
178:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
179:   for (k=0; k<mbs; k++){
180:     v  = aa + bs2*ai[k];
181:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
182:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
183:     nz = ai[k+1] - ai[k];
184:     vj = aj + ai[k];
185:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
186:     while (nz--) {
187:       /* x(:) += U(k,:)^T*(Dk*xk) */
188:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
189:       vj++; xj = x + (*vj)*bs;
190:       v += bs2;
191:     }
192:     /* xk = inv(Dk)*(Dk*xk) */
193:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
194:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
195:   }

197:   /* solve U*x = y by back substitution */
198:   for (k=mbs-1; k>=0; k--){
199:     v  = aa + bs2*ai[k];
200:     xk = x + k*bs;        /* xk */
201:     nz = ai[k+1] - ai[k];
202:     vj = aj + ai[k];
203:     xj = x + (*vj)*bs;
204:     while (nz--) {
205:       /* xk += U(k,:)*x(:) */
206:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
207:       vj++;
208:       v += bs2; xj = x + (*vj)*bs;
209:     }
210:   }

212:   PetscFree(xk_tmp);
213:   VecRestoreArray(bb,&b);
214:   VecRestoreArray(xx,&x);
215:   PetscLogFlops(bs2*(2*a->s_nz + mbs));
216:   return(0);
217: }

219: int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
220: {
221:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
222:   IS              isrow=a->row;
223:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
224:   int             nz,*vj,k,ierr,*r,idx;
225:   MatScalar       *aa=a->a,*v,*d;
226:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

229:   VecGetArray(bb,&b);
230:   VecGetArray(xx,&x);
231:   t  = a->solve_work;
232:   ISGetIndices(isrow,&r);

234:   /* solve U^T * D * y = b by forward substitution */
235:   tp = t;
236:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
237:     idx   = 7*r[k];
238:     tp[0] = b[idx];
239:     tp[1] = b[idx+1];
240:     tp[2] = b[idx+2];
241:     tp[3] = b[idx+3];
242:     tp[4] = b[idx+4];
243:     tp[5] = b[idx+5];
244:     tp[6] = b[idx+6];
245:     tp += 7;
246:   }
247: 
248:   for (k=0; k<mbs; k++){
249:     v  = aa + 49*ai[k];
250:     vj = aj + ai[k];
251:     tp = t + k*7;
252:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
253:     nz = ai[k+1] - ai[k];
254:     tp = t + (*vj)*7;
255:     while (nz--) {
256:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
257:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
258:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
259:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
260:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
261:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
262:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
263:       vj++; tp = t + (*vj)*7;
264:       v += 49;
265:     }

267:     /* xk = inv(Dk)*(Dk*xk) */
268:     d  = aa+k*49;          /* ptr to inv(Dk) */
269:     tp    = t + k*7;
270:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
271:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
272:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
273:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
274:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
275:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
276:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
277:   }

279:   /* solve U*x = y by back substitution */
280:   for (k=mbs-1; k>=0; k--){
281:     v  = aa + 49*ai[k];
282:     vj = aj + ai[k];
283:     tp    = t + k*7;
284:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
285:     nz = ai[k+1] - ai[k];
286: 
287:     tp = t + (*vj)*7;
288:     while (nz--) {
289:       /* xk += U(k,:)*x(:) */
290:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
291:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
292:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
293:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
294:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
295:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
296:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
297:       vj++; tp = t + (*vj)*7;
298:       v += 49;
299:     }
300:     tp    = t + k*7;
301:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302:     idx   = 7*r[k];
303:     x[idx]     = x0;
304:     x[idx+1]   = x1;
305:     x[idx+2]   = x2;
306:     x[idx+3]   = x3;
307:     x[idx+4]   = x4;
308:     x[idx+5]   = x5;
309:     x[idx+6]   = x6;
310:   }

312:   VecRestoreArray(bb,&b);
313:   VecRestoreArray(xx,&x);
314:   PetscLogFlops(49*(2*a->s_nz + mbs));
315:   return(0);
316: }

318: int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
319: {
320:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
321:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
322:   MatScalar       *aa=a->a,*v,*d;
323:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
324:   int             nz,*vj,k,ierr;

327: 
328:   VecGetArray(bb,&b);
329:   VecGetArray(xx,&x);
330: 
331:   /* solve U^T * D * y = b by forward substitution */
332:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
333:   for (k=0; k<mbs; k++){
334:     v  = aa + 49*ai[k];
335:     xp = x + k*7;
336:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
337:     nz = ai[k+1] - ai[k];
338:     vj = aj + ai[k];
339:     xp = x + (*vj)*7;
340:     while (nz--) {
341:       /* x(:) += U(k,:)^T*(Dk*xk) */
342:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
343:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
344:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
345:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
346:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
347:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
348:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
349:       vj++; xp = x + (*vj)*7;
350:       v += 49;
351:     }
352:     /* xk = inv(Dk)*(Dk*xk) */
353:     d  = aa+k*49;          /* ptr to inv(Dk) */
354:     xp = x + k*7;
355:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
356:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
357:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
358:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
359:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
360:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
361:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
362:   }

364:   /* solve U*x = y by back substitution */
365:   for (k=mbs-1; k>=0; k--){
366:     v  = aa + 49*ai[k];
367:     xp = x + k*7;
368:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
369:     nz = ai[k+1] - ai[k];
370:     vj = aj + ai[k];
371:     xp = x + (*vj)*7;
372:     while (nz--) {
373:       /* xk += U(k,:)*x(:) */
374:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
375:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
376:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
377:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
378:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
379:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
380:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
381:       vj++;
382:       v += 49; xp = x + (*vj)*7;
383:     }
384:     xp = x + k*7;
385:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
386:   }

388:   VecRestoreArray(bb,&b);
389:   VecRestoreArray(xx,&x);
390:   PetscLogFlops(49*(2*a->s_nz + mbs));
391:   return(0);
392: }

394: int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
395: {
396:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
397:   IS              isrow=a->row;
398:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
399:   int             nz,*vj,k,ierr,*r,idx;
400:   MatScalar       *aa=a->a,*v,*d;
401:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

404:   VecGetArray(bb,&b);
405:   VecGetArray(xx,&x);
406:   t  = a->solve_work;
407:   ISGetIndices(isrow,&r);

409:   /* solve U^T * D * y = b by forward substitution */
410:   tp = t;
411:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
412:     idx   = 6*r[k];
413:     tp[0] = b[idx];
414:     tp[1] = b[idx+1];
415:     tp[2] = b[idx+2];
416:     tp[3] = b[idx+3];
417:     tp[4] = b[idx+4];
418:     tp[5] = b[idx+5];
419:     tp += 6;
420:   }
421: 
422:   for (k=0; k<mbs; k++){
423:     v  = aa + 36*ai[k];
424:     vj = aj + ai[k];
425:     tp = t + k*6;
426:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
427:     nz = ai[k+1] - ai[k];
428:     tp = t + (*vj)*6;
429:     while (nz--) {
430:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
431:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
432:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
433:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
434:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
435:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
436:       vj++; tp = t + (*vj)*6;
437:       v += 36;
438:     }

440:     /* xk = inv(Dk)*(Dk*xk) */
441:     d  = aa+k*36;          /* ptr to inv(Dk) */
442:     tp    = t + k*6;
443:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
444:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
445:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
446:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
447:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
448:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
449:   }

451:   /* solve U*x = y by back substitution */
452:   for (k=mbs-1; k>=0; k--){
453:     v  = aa + 36*ai[k];
454:     vj = aj + ai[k];
455:     tp    = t + k*6;
456:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
457:     nz = ai[k+1] - ai[k];
458: 
459:     tp = t + (*vj)*6;
460:     while (nz--) {
461:       /* xk += U(k,:)*x(:) */
462:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
463:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
464:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
465:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
466:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
467:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
468:       vj++; tp = t + (*vj)*6;
469:       v += 36;
470:     }
471:     tp    = t + k*6;
472:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
473:     idx   = 6*r[k];
474:     x[idx]     = x0;
475:     x[idx+1]   = x1;
476:     x[idx+2]   = x2;
477:     x[idx+3]   = x3;
478:     x[idx+4]   = x4;
479:     x[idx+5]   = x5;
480:   }

482:   VecRestoreArray(bb,&b);
483:   VecRestoreArray(xx,&x);
484:   PetscLogFlops(36*(2*a->s_nz + mbs));
485:   return(0);
486: }

488: int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
489: {
490:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
491:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
492:   MatScalar       *aa=a->a,*v,*d;
493:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5;
494:   int             nz,*vj,k,ierr;

497: 
498:   VecGetArray(bb,&b);
499:   VecGetArray(xx,&x);
500: 
501:   /* solve U^T * D * y = b by forward substitution */
502:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
503:   for (k=0; k<mbs; k++){
504:     v  = aa + 36*ai[k];
505:     xp = x + k*6;
506:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
507:     nz = ai[k+1] - ai[k];
508:     vj = aj + ai[k];
509:     xp = x + (*vj)*6;
510:     while (nz--) {
511:       /* x(:) += U(k,:)^T*(Dk*xk) */
512:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
513:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
514:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
515:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
516:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
517:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
518:       vj++; xp = x + (*vj)*6;
519:       v += 36;
520:     }
521:     /* xk = inv(Dk)*(Dk*xk) */
522:     d  = aa+k*36;          /* ptr to inv(Dk) */
523:     xp = x + k*6;
524:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
525:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
526:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
527:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
528:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
529:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
530:   }

532:   /* solve U*x = y by back substitution */
533:   for (k=mbs-1; k>=0; k--){
534:     v  = aa + 36*ai[k];
535:     xp = x + k*6;
536:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
537:     nz = ai[k+1] - ai[k];
538:     vj = aj + ai[k];
539:     xp = x + (*vj)*6;
540:     while (nz--) {
541:       /* xk += U(k,:)*x(:) */
542:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
543:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
544:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
545:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
546:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
547:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
548:       vj++;
549:       v += 36; xp = x + (*vj)*6;
550:     }
551:     xp = x + k*6;
552:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
553:   }

555:   VecRestoreArray(bb,&b);
556:   VecRestoreArray(xx,&x);
557:   PetscLogFlops(36*(2*a->s_nz + mbs));
558:   return(0);
559: }

561: int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
562: {
563:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
564:   IS              isrow=a->row;
565:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
566:   int             nz,*vj,k,ierr,*r,idx;
567:   MatScalar       *aa=a->a,*v,*diag;
568:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,*t,*tp;

571:   VecGetArray(bb,&b);
572:   VecGetArray(xx,&x);
573:   t  = a->solve_work;
574:   ISGetIndices(isrow,&r);

576:   /* solve U^T * D * y = b by forward substitution */
577:   tp = t;
578:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
579:     idx   = 5*r[k];
580:     tp[0] = b[idx];
581:     tp[1] = b[idx+1];
582:     tp[2] = b[idx+2];
583:     tp[3] = b[idx+3];
584:     tp[4] = b[idx+4];
585:     tp += 5;
586:   }
587: 
588:   for (k=0; k<mbs; k++){
589:     v  = aa + 25*ai[k];
590:     vj = aj + ai[k];
591:     tp = t + k*5;
592:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
593:     nz = ai[k+1] - ai[k];

595:     tp = t + (*vj)*5;
596:     while (nz--) {
597:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
598:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
599:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
600:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
601:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
602:       vj++; tp = t + (*vj)*5;
603:       v += 25;
604:     }

606:     /* xk = inv(Dk)*(Dk*xk) */
607:     diag  = aa+k*25;          /* ptr to inv(Dk) */
608:     tp    = t + k*5;
609:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
610:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
611:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
612:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
613:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
614:   }

616:   /* solve U*x = y by back substitution */
617:   for (k=mbs-1; k>=0; k--){
618:     v  = aa + 25*ai[k];
619:     vj = aj + ai[k];
620:     tp    = t + k*5;
621:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
622:     nz = ai[k+1] - ai[k];
623: 
624:     tp = t + (*vj)*5;
625:     while (nz--) {
626:       /* xk += U(k,:)*x(:) */
627:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
628:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
629:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
630:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
631:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
632:       vj++; tp = t + (*vj)*5;
633:       v += 25;
634:     }
635:     tp    = t + k*5;
636:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
637:     idx   = 5*r[k];
638:     x[idx]     = x0;
639:     x[idx+1]   = x1;
640:     x[idx+2]   = x2;
641:     x[idx+3]   = x3;
642:     x[idx+4]   = x4;
643:   }

645:   VecRestoreArray(bb,&b);
646:   VecRestoreArray(xx,&x);
647:   PetscLogFlops(25*(2*a->s_nz + mbs));
648:   return(0);
649: }

651: int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
652: {
653:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
654:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
655:   MatScalar       *aa=a->a,*v,*diag;
656:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4;
657:   int             nz,*vj,k,ierr;

660: 
661:   VecGetArray(bb,&b);
662:   VecGetArray(xx,&x);

664:   /* solve U^T * D * y = b by forward substitution */
665:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
666:   for (k=0; k<mbs; k++){
667:     v  = aa + 25*ai[k];
668:     xp = x + k*5;
669:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
670:     nz = ai[k+1] - ai[k];
671:     vj = aj + ai[k];
672:     xp = x + (*vj)*5;
673:     while (nz--) {
674:       /* x(:) += U(k,:)^T*(Dk*xk) */
675:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
676:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
677:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
678:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
679:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
680:       vj++; xp = x + (*vj)*5;
681:       v += 25;
682:     }
683:     /* xk = inv(Dk)*(Dk*xk) */
684:     diag = aa+k*25;          /* ptr to inv(Dk) */
685:     xp   = x + k*5;
686:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
687:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
688:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
689:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
690:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
691:   }

693:   /* solve U*x = y by back substitution */
694:   for (k=mbs-1; k>=0; k--){
695:     v  = aa + 25*ai[k];
696:     xp = x + k*5;
697:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
698:     nz = ai[k+1] - ai[k];
699:     vj = aj + ai[k];
700:     xp = x + (*vj)*5;
701:     while (nz--) {
702:       /* xk += U(k,:)*x(:) */
703:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
704:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
705:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
706:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
707:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
708:       vj++;
709:       v += 25; xp = x + (*vj)*5;
710:     }
711:     xp = x + k*5;
712:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
713:   }

715:   VecRestoreArray(bb,&b);
716:   VecRestoreArray(xx,&x);
717:   PetscLogFlops(25*(2*a->s_nz + mbs));
718:   return(0);
719: }

721: int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
722: {
723:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
724:   IS              isrow=a->row;
725:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
726:   int             nz,*vj,k,ierr,*r,idx;
727:   MatScalar       *aa=a->a,*v,*diag;
728:   PetscScalar     *x,*b,x0,x1,x2,x3,*t,*tp;

731:   VecGetArray(bb,&b);
732:   VecGetArray(xx,&x);
733:   t  = a->solve_work;
734:   ISGetIndices(isrow,&r);

736:   /* solve U^T * D * y = b by forward substitution */
737:   tp = t;
738:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
739:     idx   = 4*r[k];
740:     tp[0] = b[idx];
741:     tp[1] = b[idx+1];
742:     tp[2] = b[idx+2];
743:     tp[3] = b[idx+3];
744:     tp += 4;
745:   }
746: 
747:   for (k=0; k<mbs; k++){
748:     v  = aa + 16*ai[k];
749:     vj = aj + ai[k];
750:     tp = t + k*4;
751:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
752:     nz = ai[k+1] - ai[k];

754:     tp = t + (*vj)*4;
755:     while (nz--) {
756:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
757:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
758:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
759:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
760:       vj++; tp = t + (*vj)*4;
761:       v += 16;
762:     }

764:     /* xk = inv(Dk)*(Dk*xk) */
765:     diag  = aa+k*16;          /* ptr to inv(Dk) */
766:     tp    = t + k*4;
767:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
768:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
769:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
770:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
771:   }

773:   /* solve U*x = y by back substitution */
774:   for (k=mbs-1; k>=0; k--){
775:     v  = aa + 16*ai[k];
776:     vj = aj + ai[k];
777:     tp    = t + k*4;
778:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
779:     nz = ai[k+1] - ai[k];
780: 
781:     tp = t + (*vj)*4;
782:     while (nz--) {
783:       /* xk += U(k,:)*x(:) */
784:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
785:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
786:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
787:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
788:       vj++; tp = t + (*vj)*4;
789:       v += 16;
790:     }
791:     tp    = t + k*4;
792:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
793:     idx        = 4*r[k];
794:     x[idx]     = x0;
795:     x[idx+1]   = x1;
796:     x[idx+2]   = x2;
797:     x[idx+3]   = x3;
798:   }

800:   VecRestoreArray(bb,&b);
801:   VecRestoreArray(xx,&x);
802:   PetscLogFlops(16*(2*a->s_nz + mbs));
803:   return(0);
804: }

806: /*
807:    Special case where the matrix was factored in the natural ordering. 
808:    This eliminates the need for the column and row permutation.
809: */
810: int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
811: {
812:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
813:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
814:   MatScalar       *aa=a->a,*v,*diag;
815:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3;
816:   int             nz,*vj,k,ierr;

819: 
820:   VecGetArray(bb,&b);
821:   VecGetArray(xx,&x);

823:   /* solve U^T * D * y = b by forward substitution */
824:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
825:   for (k=0; k<mbs; k++){
826:     v  = aa + 16*ai[k];
827:     xp = x + k*4;
828:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
829:     nz = ai[k+1] - ai[k];
830:     vj = aj + ai[k];
831:     xp = x + (*vj)*4;
832:     while (nz--) {
833:       /* x(:) += U(k,:)^T*(Dk*xk) */
834:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
835:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
836:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
837:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
838:       vj++; xp = x + (*vj)*4;
839:       v += 16;
840:     }
841:     /* xk = inv(Dk)*(Dk*xk) */
842:     diag = aa+k*16;          /* ptr to inv(Dk) */
843:     xp   = x + k*4;
844:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
845:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
846:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
847:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
848:   }

850:   /* solve U*x = y by back substitution */
851:   for (k=mbs-1; k>=0; k--){
852:     v  = aa + 16*ai[k];
853:     xp = x + k*4;
854:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
855:     nz = ai[k+1] - ai[k];
856:     vj = aj + ai[k];
857:     xp = x + (*vj)*4;
858:     while (nz--) {
859:       /* xk += U(k,:)*x(:) */
860:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
861:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
862:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
863:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
864:       vj++;
865:       v += 16; xp = x + (*vj)*4;
866:     }
867:     xp = x + k*4;
868:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
869:   }

871:   VecRestoreArray(bb,&b);
872:   VecRestoreArray(xx,&x);
873:   PetscLogFlops(16*(2*a->s_nz + mbs));
874:   return(0);
875: }

877: int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
878: {
879:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
880:   IS              isrow=a->row;
881:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
882:   int             nz,*vj,k,ierr,*r,idx;
883:   MatScalar       *aa=a->a,*v,*diag;
884:   PetscScalar     *x,*b,x0,x1,x2,*t,*tp;

887:   VecGetArray(bb,&b);
888:   VecGetArray(xx,&x);
889:   t  = a->solve_work;
890:   ISGetIndices(isrow,&r);

892:   /* solve U^T * D * y = b by forward substitution */
893:   tp = t;
894:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
895:     idx   = 3*r[k];
896:     tp[0] = b[idx];
897:     tp[1] = b[idx+1];
898:     tp[2] = b[idx+2];
899:     tp += 3;
900:   }
901: 
902:   for (k=0; k<mbs; k++){
903:     v  = aa + 9*ai[k];
904:     vj = aj + ai[k];
905:     tp = t + k*3;
906:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
907:     nz = ai[k+1] - ai[k];

909:     tp = t + (*vj)*3;
910:     while (nz--) {
911:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
912:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
913:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
914:       vj++; tp = t + (*vj)*3;
915:       v += 9;
916:     }

918:     /* xk = inv(Dk)*(Dk*xk) */
919:     diag  = aa+k*9;          /* ptr to inv(Dk) */
920:     tp    = t + k*3;
921:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
922:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
923:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
924:   }

926:   /* solve U*x = y by back substitution */
927:   for (k=mbs-1; k>=0; k--){
928:     v  = aa + 9*ai[k];
929:     vj = aj + ai[k];
930:     tp    = t + k*3;
931:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
932:     nz = ai[k+1] - ai[k];
933: 
934:     tp = t + (*vj)*3;
935:     while (nz--) {
936:       /* xk += U(k,:)*x(:) */
937:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
938:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
939:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
940:       vj++; tp = t + (*vj)*3;
941:       v += 9;
942:     }
943:     tp    = t + k*3;
944:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
945:     idx      = 3*r[k];
946:     x[idx]   = x0;
947:     x[idx+1] = x1;
948:     x[idx+2] = x2;
949:   }

951:   VecRestoreArray(bb,&b);
952:   VecRestoreArray(xx,&x);
953:   PetscLogFlops(9*(2*a->s_nz + mbs));
954:   return(0);
955: }

957: /*
958:    Special case where the matrix was factored in the natural ordering. 
959:    This eliminates the need for the column and row permutation.
960: */
961: int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
962: {
963:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
964:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
965:   MatScalar       *aa=a->a,*v,*diag;
966:   PetscScalar     *x,*xp,*b,x0,x1,x2;
967:   int             nz,*vj,k,ierr;

970: 
971:   VecGetArray(bb,&b);
972:   VecGetArray(xx,&x);

974:   /* solve U^T * D * y = b by forward substitution */
975:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
976:   for (k=0; k<mbs; k++){
977:     v  = aa + 9*ai[k];
978:     xp = x + k*3;
979:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
980:     nz = ai[k+1] - ai[k];
981:     vj = aj + ai[k];
982:     xp = x + (*vj)*3;
983:     while (nz--) {
984:       /* x(:) += U(k,:)^T*(Dk*xk) */
985:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
986:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
987:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
988:       vj++; xp = x + (*vj)*3;
989:       v += 9;
990:     }
991:     /* xk = inv(Dk)*(Dk*xk) */
992:     diag = aa+k*9;          /* ptr to inv(Dk) */
993:     xp   = x + k*3;
994:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
995:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
996:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
997:   }

999:   /* solve U*x = y by back substitution */
1000:   for (k=mbs-1; k>=0; k--){
1001:     v  = aa + 9*ai[k];
1002:     xp = x + k*3;
1003:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1004:     nz = ai[k+1] - ai[k];
1005:     vj = aj + ai[k];
1006:     xp = x + (*vj)*3;
1007:     while (nz--) {
1008:       /* xk += U(k,:)*x(:) */
1009:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1010:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1011:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1012:       vj++;
1013:       v += 9; xp = x + (*vj)*3;
1014:     }
1015:     xp = x + k*3;
1016:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1017:   }

1019:   VecRestoreArray(bb,&b);
1020:   VecRestoreArray(xx,&x);
1021:   PetscLogFlops(9*(2*a->s_nz + mbs));
1022:   return(0);
1023: }

1025: int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1026: {
1027:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ *)A->data;
1028:   IS              isrow=a->row;
1029:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1030:   int             nz,*vj,k,k2,ierr,*r,idx;
1031:   MatScalar       *aa=a->a,*v,*diag;
1032:   PetscScalar     *x,*b,x0,x1,*t;

1035:   VecGetArray(bb,&b);
1036:   VecGetArray(xx,&x);
1037:   t  = a->solve_work;
1038:   /* printf("called MatSolve_SeqSBAIJ_2n"); */
1039:   ISGetIndices(isrow,&r);

1041:   /* solve U^T * D * y = perm(b) by forward substitution */
1042:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1043:     idx = 2*r[k];
1044:     t[k*2]   = b[idx];
1045:     t[k*2+1] = b[idx+1];
1046:   }
1047:   for (k=0; k<mbs; k++){
1048:     v  = aa + 4*ai[k];
1049:     vj = aj + ai[k];
1050:     k2 = k*2;
1051:     x0 = t[k2]; x1 = t[k2+1];
1052:     nz = ai[k+1] - ai[k];
1053:     while (nz--) {
1054:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1055:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1056:       vj++; v += 4;
1057:     }
1058:     diag = aa+k*4;  /* ptr to inv(Dk) */
1059:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1060:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1061:   }

1063:   /* solve U*x = y by back substitution */
1064:   for (k=mbs-1; k>=0; k--){
1065:     v  = aa + 4*ai[k];
1066:     vj = aj + ai[k];
1067:     k2 = k*2;
1068:     x0 = t[k2]; x1 = t[k2+1];
1069:     nz = ai[k+1] - ai[k];
1070:     while (nz--) {
1071:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1072:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1073:       vj++; v += 4;
1074:     }
1075:     t[k2]    = x0;
1076:     t[k2+1]  = x1;
1077:     idx      = 2*r[k];
1078:     x[idx]   = x0;
1079:     x[idx+1] = x1;
1080:   }

1082:   ISRestoreIndices(isrow,&r);
1083:   VecRestoreArray(bb,&b);
1084:   VecRestoreArray(xx,&x);
1085:   PetscLogFlops(4*(2*a->s_nz + mbs));
1086:   return(0);
1087: }

1089: /*
1090:    Special case where the matrix was factored in the natural ordering. 
1091:    This eliminates the need for the column and row permutation.
1092: */
1093: int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1094: {
1095:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1096:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1097:   MatScalar       *aa=a->a,*v,*diag;
1098:   PetscScalar     *x,*b,x0,x1;
1099:   int             nz,*vj,k,k2,ierr;

1102: 
1103:   VecGetArray(bb,&b);
1104:   VecGetArray(xx,&x);

1106:   /* solve U^T * D * y = b by forward substitution */
1107:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1108:   for (k=0; k<mbs; k++){
1109:     v  = aa + 4*ai[k];
1110:     vj = aj + ai[k];
1111:     k2 = k*2;
1112:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1113:     nz = ai[k+1] - ai[k];
1114: 
1115:     while (nz--) {
1116:       /* x(:) += U(k,:)^T*(Dk*xk) */
1117:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1118:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1119:       vj++; v += 4;
1120:     }
1121:     /* xk = inv(Dk)*(Dk*xk) */
1122:     diag = aa+k*4;          /* ptr to inv(Dk) */
1123:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1124:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1125:   }

1127:   /* solve U*x = y by back substitution */
1128:   for (k=mbs-1; k>=0; k--){
1129:     v  = aa + 4*ai[k];
1130:     vj = aj + ai[k];
1131:     k2 = k*2;
1132:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1133:     nz = ai[k+1] - ai[k];
1134:     while (nz--) {
1135:       /* xk += U(k,:)*x(:) */
1136:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1137:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1138:       vj++; v += 4;
1139:     }
1140:     x[k2]     = x0;
1141:     x[k2+1]   = x1;
1142:   }

1144:   VecRestoreArray(bb,&b);
1145:   VecRestoreArray(xx,&x);
1146:   PetscLogFlops(4*(2*a->s_nz + mbs)); /* bs2*(2*a->s_nz + mbs) */
1147:   return(0);
1148: }

1150: int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1151: {
1152:   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
1153:   IS              isrow=a->row;
1154:   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
1155:   MatScalar       *aa=a->a,*v;
1156:   PetscScalar     *x,*b,xk,*t;
1157:   int             nz,*vj,k;

1160:   if (!mbs) return(0);

1162:   VecGetArray(bb,&b);
1163:   VecGetArray(xx,&x);
1164:   t = a->solve_work;

1166:   ISGetIndices(isrow,&rip);
1167: 
1168:   /* solve U^T*D*y = perm(b) by forward substitution */
1169:   for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1170:   for (k=0; k<mbs; k++){
1171:     v  = aa + ai[k];
1172:     vj = aj + ai[k];
1173:     xk = t[k];
1174:     nz = ai[k+1] - ai[k];
1175:     while (nz--) t[*vj++] += (*v++) * xk;
1176:     t[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1177:   }

1179:   /* solve U*x = y by back substitution */
1180:   for (k=mbs-1; k>=0; k--){
1181:     v  = aa + ai[k];
1182:     vj = aj + ai[k];
1183:     xk = t[k];
1184:     nz = ai[k+1] - ai[k];
1185:     while (nz--) xk += (*v++) * t[*vj++];
1186:     t[k] = xk;
1187:     x[rip[k]] = xk;
1188:   }

1190:   ISRestoreIndices(isrow,&rip);
1191:   VecRestoreArray(bb,&b);
1192:   VecRestoreArray(xx,&x);
1193:   PetscLogFlops(4*a->s_nz + A->m);
1194:   return(0);
1195: }

1197: /*
1198:       Special case where the matrix was ILU(0) factored in the natural
1199:    ordering. This eliminates the need for the column and row permutation.
1200: */
1201: int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1202: {
1203:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1204:   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
1205:   MatScalar       *aa=a->a,*v;
1206:   PetscScalar     *x,*b,xk;
1207:   int             nz,*vj,k;

1210: 
1211:   VecGetArray(bb,&b);
1212:   VecGetArray(xx,&x);
1213: 
1214:   /* solve U^T*D*y = b by forward substitution */
1215:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1216:   for (k=0; k<mbs; k++){
1217:     v  = aa + ai[k];
1218:     vj = aj + ai[k];
1219:     xk = x[k];
1220:     nz = ai[k+1] - ai[k];
1221:     while (nz--) x[*vj++] += (*v++) * xk;
1222:     x[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1223:   }

1225:   /* solve U*x = y by back substitution */
1226:   for (k=mbs-2; k>=0; k--){
1227:     v  = aa + ai[k];
1228:     vj = aj + ai[k];
1229:     xk = x[k];
1230:     nz = ai[k+1] - ai[k];
1231:     while (nz--) xk += (*v++) * x[*vj++];
1232:     x[k] = xk;
1233:   }

1235:   VecRestoreArray(bb,&b);
1236:   VecRestoreArray(xx,&x);
1237:   PetscLogFlops(4*a->s_nz + A->m);
1238:   return(0);
1239: }

1241: int MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,int levels,Mat *B)
1242: {
1243:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1244:   int         *rip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1245:   int         *jutmp,bs = a->bs,bs2=a->bs2;
1246:   int         m,nzi,realloc = 0,*levtmp;
1247:   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
1248:   int         incrlev,*lev,lev_ik,shift;
1249:   PetscTruth  perm_identity;

1252: 
1253:   /* check whether perm is the identity mapping */
1254:   ISIdentity(perm,&perm_identity);
1255:   if (!perm_identity) a->permute = PETSC_TRUE;
1256:   if (perm_identity){
1257:     ai = a->i; aj = a->j;
1258:   } else { /*  non-trivial permutation */
1259:     MatReorderingSeqSBAIJ(A, perm);
1260:     ai = a->inew; aj = a->jnew;
1261:   }
1262: 
1263:   /* initialization */
1264:   /* Don't know how many column pointers are needed so estimate. 
1265:      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
1266:   ierr  = ISGetIndices(perm,&rip);
1267:   umax  = (int)(f*ai[mbs] + 1);
1268:   ierr  = PetscMalloc(umax*sizeof(int),&lev);
1269:   umax += mbs + 1;
1270:   shift = mbs + 1;
1271:   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);
1272:   ierr  = PetscMalloc(umax*sizeof(int),&ju);
1273:   iu[0] = mbs+1;
1274:   juptr = mbs;
1275:   ierr  = PetscMalloc(mbs*sizeof(int),&jl);
1276:   ierr  = PetscMalloc(mbs*sizeof(int),&q);
1277:   ierr  = PetscMalloc((mbs+1)*sizeof(int),&levtmp);
1278:   for (i=0; i<mbs; i++){
1279:     jl[i] = mbs; q[i] = 0;
1280:   }

1282:   /* for each row k */
1283:   for (k=0; k<mbs; k++){
1284:     nzk = 0;
1285:     q[k] = mbs;
1286:     /* initialize nonzero structure of k-th row to row rip[k] of A */
1287:     jmin = ai[rip[k]];
1288:     jmax = ai[rip[k]+1];
1289:     for (j=jmin; j<jmax; j++){
1290:       vj = rip[aj[j]];
1291:       if (vj > k){
1292:         qm = k;
1293:         do {
1294:           m  = qm; qm = q[m];
1295:         } while(qm < vj);
1296:         if (qm == vj) {
1297:           printf(" error: duplicate entry in An"); break;
1298:         }
1299:         nzk++;
1300:         q[m]   = vj;
1301:         q[vj]  = qm;
1302:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1303:       }
1304:     }

1306:     /* modify nonzero structure of k-th row by computing fill-in
1307:        for each row i to be merged in */
1308:     i = k;
1309:     i = jl[i]; /* next pivot row (== 0 for symbolic factorization) */
1310: 
1311:     while (i < mbs){
1312:       /* merge row i into k-th row */
1313:       j=iu[i];
1314:       lev_ik = lev[j-shift];
1315:       nzi = iu[i+1] - (iu[i]+1);
1316:       jmin = iu[i] + 1; jmax = iu[i] + nzi;
1317:       qm = k;
1318:       for (j=jmin; j<jmax+1; j++){
1319:         vj = ju[j];
1320:         incrlev = lev[j-shift]+lev_ik+1;

1322:         if (incrlev > levels) continue;
1323:         do {
1324:           m = qm; qm = q[m];
1325:         } while (qm < vj);
1326:         if (qm != vj){      /* a fill */
1327:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1328:           levtmp[vj] = incrlev;
1329:         }
1330:         else {              /* already a nonzero element */
1331:           if (levtmp[vj]>incrlev) levtmp[vj] = incrlev;
1332:         }
1333:       }
1334:       i = jl[i]; /* next pivot row */
1335:     }
1336: 
1337:     /* add k to row list for first nonzero element in k-th row */
1338:     if (nzk > 1){
1339:       i = q[k]; /* col value of first nonzero element in k_th row of U */
1340:       jl[k] = jl[i]; jl[i] = k;
1341:     }
1342:     iu[k+1] = iu[k] + nzk;

1344:     /* allocate more space to ju and lev if needed */
1345:     if (iu[k+1] > umax) {
1346:       printf("allocate more space, iu[%d]=%d > umax=%dn",k+1, iu[k+1],umax);
1347:       /* estimate how much additional space we will need */
1348:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1349:       /* just double the memory each time */
1350:       maxadd = umax;
1351:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1352:       umax += maxadd;

1354:       /* allocate a longer ju */
1355:       PetscMalloc(umax*sizeof(int),&jutmp);
1356:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1357:       PetscFree(ju);
1358:       ju   = jutmp;

1360:       ierr     = PetscMalloc(umax*sizeof(int),&jutmp);
1361:       ierr     = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));
1362:       ierr     = PetscFree(lev);
1363:       lev      = jutmp;
1364:       realloc += 2; /* count how many times we realloc */
1365:     }

1367:     /* save nonzero structure of k-th row in ju */
1368:     i=k;
1369:     jumin = juptr + 1; juptr += nzk;
1370:     for (j=jumin; j<juptr+1; j++){
1371:       i      = q[i];
1372:       ju[j]  = i;
1373:       lev[j-shift] = levtmp[i];
1374:     }
1375:   }
1376: 
1377:   if (ai[mbs] != 0) {
1378:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1379:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
1380:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use n",af);
1381:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);n",af);
1382:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.n");
1383:   } else {
1384:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.n");
1385:   }

1387:   ISRestoreIndices(perm,&rip);
1388:   PetscFree(q);
1389:   PetscFree(jl);
1390:   PetscFree(lev);
1391:   PetscFree(levtmp);

1393:   /* put together the new matrix */
1394:   MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
1395:   /* PetscLogObjectParent(*B,iperm); */
1396:   b = (Mat_SeqSBAIJ*)(*B)->data;
1397:   PetscFree(b->imax);
1398:   b->singlemalloc = PETSC_FALSE;
1399:   /* the next line frees the default space generated by the Create() */
1400:   PetscFree(b->a);
1401:   PetscFree(b->ilen);
1402:   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1403:   b->j    = ju;
1404:   b->i    = iu;
1405:   b->diag = 0;
1406:   b->ilen = 0;
1407:   b->imax = 0;
1408: 
1409:   if (b->row) {
1410:     ISDestroy(b->row);
1411:   }
1412:   if (b->icol) {
1413:     ISDestroy(b->icol);
1414:   }
1415:   b->row  = perm;
1416:   b->icol = perm;
1417:   ierr    = PetscObjectReference((PetscObject)perm);
1418:   ierr    = PetscObjectReference((PetscObject)perm);
1419:   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1420:   /* In b structure:  Free imax, ilen, old a, old j.  
1421:      Allocate idnew, solve_work, new a, new j */
1422:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1423:   b->s_maxnz = b->s_nz = iu[mbs];
1424: 
1425:   (*B)->factor                 = FACTOR_CHOLESKY;
1426:   (*B)->info.factor_mallocs    = realloc;
1427:   (*B)->info.fill_ratio_given  = f;
1428:   if (ai[mbs] != 0) {
1429:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1430:   } else {
1431:     (*B)->info.fill_ratio_needed = 0.0;
1432:   }

1434:   if (perm_identity){
1435:     switch (bs) {
1436:       case 1:
1437:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1438:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1439:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1n");
1440:         break;
1441:       case 2:
1442:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1443:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1444:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2n");
1445:         break;
1446:       case 3:
1447:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1448:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1449:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3n");
1450:         break;
1451:       case 4:
1452:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1453:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1454:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4n");
1455:         break;
1456:       case 5:
1457:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1458:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1459:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5n");
1460:         break;
1461:       case 6:
1462:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1463:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1464:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6n");
1465:         break;
1466:       case 7:
1467:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1468:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1469:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7n");
1470:       break;
1471:       default:
1472:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1473:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1474:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7n");
1475:       break;
1476:     }
1477:   }

1479:   return(0);
1480: }