Actual source code: baij2.c

  1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/vec/vecimpl.h
 5:  #include src/inline/spops.h
 6:  #include src/inline/ilu.h
 7:  #include petscbt.h

  9: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
 10: {
 11:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
 12:   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
 13:   int         start,end,*ai,*aj,bs,*nidx2;
 14:   PetscBT     table;

 17:   m     = a->mbs;
 18:   ai    = a->i;
 19:   aj    = a->j;
 20:   bs    = a->bs;

 22:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 24:   PetscBTCreate(m,table);
 25:   PetscMalloc((m+1)*sizeof(int),&nidx);
 26:   PetscMalloc((A->m+1)*sizeof(int),&nidx2);

 28:   for (i=0; i<is_max; i++) {
 29:     /* Initialise the two local arrays */
 30:     isz  = 0;
 31:     PetscBTMemzero(m,table);
 32: 
 33:     /* Extract the indices, assume there can be duplicate entries */
 34:     ISGetIndices(is[i],&idx);
 35:     ISGetLocalSize(is[i],&n);

 37:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 38:     for (j=0; j<n ; ++j){
 39:       ival = idx[j]/bs; /* convert the indices into block indices */
 40:       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 41:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 42:     }
 43:     ISRestoreIndices(is[i],&idx);
 44:     ISDestroy(is[i]);
 45: 
 46:     k = 0;
 47:     for (j=0; j<ov; j++){ /* for each overlap*/
 48:       n = isz;
 49:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 50:         row   = nidx[k];
 51:         start = ai[row];
 52:         end   = ai[row+1];
 53:         for (l = start; l<end ; l++){
 54:           val = aj[l];
 55:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 56:         }
 57:       }
 58:     }
 59:     /* expand the Index Set */
 60:     for (j=0; j<isz; j++) {
 61:       for (k=0; k<bs; k++)
 62:         nidx2[j*bs+k] = nidx[j]*bs+k;
 63:     }
 64:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 65:   }
 66:   PetscBTDestroy(table);
 67:   PetscFree(nidx);
 68:   PetscFree(nidx2);
 69:   return(0);
 70: }

 72: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
 73: {
 74:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*c;
 75:   int          *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
 76:   int          row,mat_i,*mat_j,tcol,*mat_ilen;
 77:   int          *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
 78:   int          *aj = a->j,*ai = a->i;
 79:   MatScalar    *mat_a;
 80:   Mat          C;
 81:   PetscTruth   flag;

 84:   ISSorted(iscol,(PetscTruth*)&i);
 85:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 87:   ISGetIndices(isrow,&irow);
 88:   ISGetIndices(iscol,&icol);
 89:   ISGetLocalSize(isrow,&nrows);
 90:   ISGetLocalSize(iscol,&ncols);

 92:   PetscMalloc((1+oldcols)*sizeof(int),&smap);
 93:   ssmap = smap;
 94:   PetscMalloc((1+nrows)*sizeof(int),&lens);
 95:   ierr  = PetscMemzero(smap,oldcols*sizeof(int));
 96:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 97:   /* determine lens of each row */
 98:   for (i=0; i<nrows; i++) {
 99:     kstart  = ai[irow[i]];
100:     kend    = kstart + a->ilen[irow[i]];
101:     lens[i] = 0;
102:       for (k=kstart; k<kend; k++) {
103:         if (ssmap[aj[k]]) {
104:           lens[i]++;
105:         }
106:       }
107:     }
108:   /* Create and fill new matrix */
109:   if (scall == MAT_REUSE_MATRIX) {
110:     c = (Mat_SeqBAIJ *)((*B)->data);

112:     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
114:     if (flag == PETSC_FALSE) {
115:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
116:     }
117:     PetscMemzero(c->ilen,c->mbs*sizeof(int));
118:     C = *B;
119:   } else {
120:     MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
121:   }
122:   c = (Mat_SeqBAIJ *)(C->data);
123:   for (i=0; i<nrows; i++) {
124:     row    = irow[i];
125:     kstart = ai[row];
126:     kend   = kstart + a->ilen[row];
127:     mat_i  = c->i[i];
128:     mat_j  = c->j + mat_i;
129:     mat_a  = c->a + mat_i*bs2;
130:     mat_ilen = c->ilen + i;
131:     for (k=kstart; k<kend; k++) {
132:       if ((tcol=ssmap[a->j[k]])) {
133:         *mat_j++ = tcol - 1;
134:         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135:         mat_a   += bs2;
136:         (*mat_ilen)++;
137:       }
138:     }
139:   }
140: 
141:   /* Free work space */
142:   ISRestoreIndices(iscol,&icol);
143:   PetscFree(smap);
144:   PetscFree(lens);
145:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
146:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
147: 
148:   ISRestoreIndices(isrow,&irow);
149:   *B = C;
150:   return(0);
151: }

153: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
154: {
155:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
156:   IS          is1,is2;
157:   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;

160:   ISGetIndices(isrow,&irow);
161:   ISGetIndices(iscol,&icol);
162:   ISGetLocalSize(isrow,&nrows);
163:   ISGetLocalSize(iscol,&ncols);
164: 
165:   /* Verify if the indices corespond to each element in a block 
166:    and form the IS with compressed IS */
167:   PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
168:   iary = vary + a->mbs;
169:   PetscMemzero(vary,(a->mbs)*sizeof(int));
170:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
171:   count = 0;
172:   for (i=0; i<a->mbs; i++) {
173:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
174:     if (vary[i]==bs) iary[count++] = i;
175:   }
176:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
177: 
178:   PetscMemzero(vary,(a->mbs)*sizeof(int));
179:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
180:   count = 0;
181:   for (i=0; i<a->mbs; i++) {
182:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
183:     if (vary[i]==bs) iary[count++] = i;
184:   }
185:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
186:   ISRestoreIndices(isrow,&irow);
187:   ISRestoreIndices(iscol,&icol);
188:   PetscFree(vary);

190:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
191:   ISDestroy(is1);
192:   ISDestroy(is2);
193:   return(0);
194: }

196: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
197: {
198:   int ierr,i;

201:   if (scall == MAT_INITIAL_MATRIX) {
202:     PetscMalloc((n+1)*sizeof(Mat),B);
203:   }

205:   for (i=0; i<n; i++) {
206:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
207:   }
208:   return(0);
209: }


212: /* -------------------------------------------------------*/
213: /* Should check that shapes of vectors and matrices match */
214: /* -------------------------------------------------------*/
215:  #include petscblaslapack.h

217: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
218: {
219:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
220:   PetscScalar     *x,*z,sum;
221:   MatScalar       *v;
222:   int             mbs=a->mbs,i,*idx,*ii,n,ierr;

225:   VecGetArray(xx,&x);
226:   VecGetArray(zz,&z);

228:   idx   = a->j;
229:   v     = a->a;
230:   ii    = a->i;

232:   for (i=0; i<mbs; i++) {
233:     n    = ii[1] - ii[0]; ii++;
234:     sum  = 0.0;
235:     while (n--) sum += *v++ * x[*idx++];
236:     z[i] = sum;
237:   }
238:   VecRestoreArray(xx,&x);
239:   VecRestoreArray(zz,&z);
240:   PetscLogFlops(2*a->nz - A->m);
241:   return(0);
242: }

244: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
245: {
246:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
247:   PetscScalar     *x,*z,*xb,sum1,sum2;
248:   PetscScalar     x1,x2;
249:   MatScalar       *v;
250:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

253:   VecGetArray(xx,&x);
254:   VecGetArray(zz,&z);

256:   idx   = a->j;
257:   v     = a->a;
258:   ii    = a->i;

260:   for (i=0; i<mbs; i++) {
261:     n  = ii[1] - ii[0]; ii++;
262:     sum1 = 0.0; sum2 = 0.0;
263:     for (j=0; j<n; j++) {
264:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
265:       sum1 += v[0]*x1 + v[2]*x2;
266:       sum2 += v[1]*x1 + v[3]*x2;
267:       v += 4;
268:     }
269:     z[0] = sum1; z[1] = sum2;
270:     z += 2;
271:   }
272:   VecRestoreArray(xx,&x);
273:   VecRestoreArray(zz,&z);
274:   PetscLogFlops(8*a->nz - A->m);
275:   return(0);
276: }

278: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
279: {
280:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
281:   PetscScalar  *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
282:   MatScalar    *v;
283:   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;

285: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
286: #pragma disjoint(*v,*z,*xb)
287: #endif

290:   VecGetArray(xx,&x);
291:   VecGetArray(zz,&z);

293:   idx   = a->j;
294:   v     = a->a;
295:   ii    = a->i;

297:   for (i=0; i<mbs; i++) {
298:     n  = ii[1] - ii[0]; ii++;
299:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
300:     for (j=0; j<n; j++) {
301:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
302:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
303:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
304:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
305:       v += 9;
306:     }
307:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
308:     z += 3;
309:   }
310:   VecRestoreArray(xx,&x);
311:   VecRestoreArray(zz,&z);
312:   PetscLogFlops(18*a->nz - A->m);
313:   return(0);
314: }

316: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
317: {
318:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
319:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
320:   MatScalar       *v;
321:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

324:   VecGetArray(xx,&x);
325:   VecGetArray(zz,&z);

327:   idx   = a->j;
328:   v     = a->a;
329:   ii    = a->i;

331:   for (i=0; i<mbs; i++) {
332:     n  = ii[1] - ii[0]; ii++;
333:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
334:     for (j=0; j<n; j++) {
335:       xb = x + 4*(*idx++);
336:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
337:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
338:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
339:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
340:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
341:       v += 16;
342:     }
343:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
344:     z += 4;
345:   }
346:   VecRestoreArray(xx,&x);
347:   VecRestoreArray(zz,&z);
348:   PetscLogFlops(32*a->nz - A->m);
349:   return(0);
350: }

352: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
353: {
354:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
355:   PetscScalar     sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
356:   MatScalar       *v;
357:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

360:   VecGetArray(xx,&x);
361:   VecGetArray(zz,&z);

363:   idx   = a->j;
364:   v     = a->a;
365:   ii    = a->i;

367:   for (i=0; i<mbs; i++) {
368:     n  = ii[1] - ii[0]; ii++;
369:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
370:     for (j=0; j<n; j++) {
371:       xb = x + 5*(*idx++);
372:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
373:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
374:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
375:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
376:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
377:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
378:       v += 25;
379:     }
380:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
381:     z += 5;
382:   }
383:   VecRestoreArray(xx,&x);
384:   VecRestoreArray(zz,&z);
385:   PetscLogFlops(50*a->nz - A->m);
386:   return(0);
387: }


390: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
391: {
392:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
393:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
394:   PetscScalar     x1,x2,x3,x4,x5,x6;
395:   MatScalar       *v;
396:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

399:   VecGetArray(xx,&x);
400:   VecGetArray(zz,&z);

402:   idx   = a->j;
403:   v     = a->a;
404:   ii    = a->i;

406:   for (i=0; i<mbs; i++) {
407:     n  = ii[1] - ii[0]; ii++;
408:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
409:     for (j=0; j<n; j++) {
410:       xb = x + 6*(*idx++);
411:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
412:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
413:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
414:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
415:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
416:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
417:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
418:       v += 36;
419:     }
420:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
421:     z += 6;
422:   }

424:   VecRestoreArray(xx,&x);
425:   VecRestoreArray(zz,&z);
426:   PetscLogFlops(72*a->nz - A->m);
427:   return(0);
428: }
429: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
430: {
431:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
432:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
433:   PetscScalar     x1,x2,x3,x4,x5,x6,x7;
434:   MatScalar       *v;
435:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

438:   VecGetArray(xx,&x);
439:   VecGetArray(zz,&z);

441:   idx   = a->j;
442:   v     = a->a;
443:   ii    = a->i;

445:   for (i=0; i<mbs; i++) {
446:     n  = ii[1] - ii[0]; ii++;
447:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
448:     for (j=0; j<n; j++) {
449:       xb = x + 7*(*idx++);
450:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
451:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
452:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
453:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
454:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
455:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
456:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
457:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
458:       v += 49;
459:     }
460:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
461:     z += 7;
462:   }

464:   VecRestoreArray(xx,&x);
465:   VecRestoreArray(zz,&z);
466:   PetscLogFlops(98*a->nz - A->m);
467:   return(0);
468: }

470: /*
471:     This will not work with MatScalar == float because it calls the BLAS
472: */
473: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
474: {
475:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
476:   PetscScalar     *x,*z,*xb,*work,*workt;
477:   MatScalar       *v;
478:   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
479:   int             ncols,k;

482:   VecGetArray(xx,&x);
483:   VecGetArray(zz,&z);

485:   idx   = a->j;
486:   v     = a->a;
487:   ii    = a->i;


490:   if (!a->mult_work) {
491:     k    = PetscMax(A->m,A->n);
492:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
493:   }
494:   work = a->mult_work;
495:   for (i=0; i<mbs; i++) {
496:     n     = ii[1] - ii[0]; ii++;
497:     ncols = n*bs;
498:     workt = work;
499:     for (j=0; j<n; j++) {
500:       xb = x + bs*(*idx++);
501:       for (k=0; k<bs; k++) workt[k] = xb[k];
502:       workt += bs;
503:     }
504:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
505:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
506:     v += n*bs2;
507:     z += bs;
508:   }
509:   VecRestoreArray(xx,&x);
510:   VecRestoreArray(zz,&z);
511:   PetscLogFlops(2*a->nz*bs2 - A->m);
512:   return(0);
513: }

515: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
516: {
517:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
518:   PetscScalar     *x,*y,*z,sum;
519:   MatScalar       *v;
520:   int             ierr,mbs=a->mbs,i,*idx,*ii,n;

523:   VecGetArray(xx,&x);
524:   VecGetArray(yy,&y);
525:   if (zz != yy) {
526:     VecGetArray(zz,&z);
527:   } else {
528:     z = y;
529:   }

531:   idx   = a->j;
532:   v     = a->a;
533:   ii    = a->i;

535:   for (i=0; i<mbs; i++) {
536:     n    = ii[1] - ii[0]; ii++;
537:     sum  = y[i];
538:     while (n--) sum += *v++ * x[*idx++];
539:     z[i] = sum;
540:   }
541:   VecRestoreArray(xx,&x);
542:   VecRestoreArray(yy,&y);
543:   if (zz != yy) {
544:     VecRestoreArray(zz,&z);
545:   }
546:   PetscLogFlops(2*a->nz);
547:   return(0);
548: }

550: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
551: {
552:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
553:   PetscScalar     *x,*y,*z,*xb,sum1,sum2;
554:   PetscScalar     x1,x2;
555:   MatScalar       *v;
556:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

559:   VecGetArray(xx,&x);
560:   VecGetArray(yy,&y);
561:   if (zz != yy) {
562:     VecGetArray(zz,&z);
563:   } else {
564:     z = y;
565:   }

567:   idx   = a->j;
568:   v     = a->a;
569:   ii    = a->i;

571:   for (i=0; i<mbs; i++) {
572:     n  = ii[1] - ii[0]; ii++;
573:     sum1 = y[0]; sum2 = y[1];
574:     for (j=0; j<n; j++) {
575:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
576:       sum1 += v[0]*x1 + v[2]*x2;
577:       sum2 += v[1]*x1 + v[3]*x2;
578:       v += 4;
579:     }
580:     z[0] = sum1; z[1] = sum2;
581:     z += 2; y += 2;
582:   }
583:   VecRestoreArray(xx,&x);
584:   VecRestoreArray(yy,&y);
585:   if (zz != yy) {
586:     VecRestoreArray(zz,&z);
587:   }
588:   PetscLogFlops(4*a->nz);
589:   return(0);
590: }

592: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
593: {
594:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
595:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
596:   MatScalar       *v;
597:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

600:   VecGetArray(xx,&x);
601:   VecGetArray(yy,&y);
602:   if (zz != yy) {
603:     VecGetArray(zz,&z);
604:   } else {
605:     z = y;
606:   }

608:   idx   = a->j;
609:   v     = a->a;
610:   ii    = a->i;

612:   for (i=0; i<mbs; i++) {
613:     n  = ii[1] - ii[0]; ii++;
614:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
615:     for (j=0; j<n; j++) {
616:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
617:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
618:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
619:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
620:       v += 9;
621:     }
622:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
623:     z += 3; y += 3;
624:   }
625:   VecRestoreArray(xx,&x);
626:   VecRestoreArray(yy,&y);
627:   if (zz != yy) {
628:     VecRestoreArray(zz,&z);
629:   }
630:   PetscLogFlops(18*a->nz);
631:   return(0);
632: }

634: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
635: {
636:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
637:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
638:   MatScalar       *v;
639:   int             ierr,mbs=a->mbs,i,*idx,*ii;
640:   int             j,n;

643:   VecGetArray(xx,&x);
644:   VecGetArray(yy,&y);
645:   if (zz != yy) {
646:     VecGetArray(zz,&z);
647:   } else {
648:     z = y;
649:   }

651:   idx   = a->j;
652:   v     = a->a;
653:   ii    = a->i;

655:   for (i=0; i<mbs; i++) {
656:     n  = ii[1] - ii[0]; ii++;
657:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
658:     for (j=0; j<n; j++) {
659:       xb = x + 4*(*idx++);
660:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
661:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
662:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
663:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
664:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
665:       v += 16;
666:     }
667:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
668:     z += 4; y += 4;
669:   }
670:   VecRestoreArray(xx,&x);
671:   VecRestoreArray(yy,&y);
672:   if (zz != yy) {
673:     VecRestoreArray(zz,&z);
674:   }
675:   PetscLogFlops(32*a->nz);
676:   return(0);
677: }

679: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
680: {
681:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
682:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
683:   MatScalar       *v;
684:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

687:   VecGetArray(xx,&x);
688:   VecGetArray(yy,&y);
689:   if (zz != yy) {
690:     VecGetArray(zz,&z);
691:   } else {
692:     z = y;
693:   }

695:   idx   = a->j;
696:   v     = a->a;
697:   ii    = a->i;

699:   for (i=0; i<mbs; i++) {
700:     n  = ii[1] - ii[0]; ii++;
701:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
702:     for (j=0; j<n; j++) {
703:       xb = x + 5*(*idx++);
704:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
705:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
706:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
707:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
708:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
709:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
710:       v += 25;
711:     }
712:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
713:     z += 5; y += 5;
714:   }
715:   VecRestoreArray(xx,&x);
716:   VecRestoreArray(yy,&y);
717:   if (zz != yy) {
718:     VecRestoreArray(zz,&z);
719:   }
720:   PetscLogFlops(50*a->nz);
721:   return(0);
722: }
723: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
724: {
725:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
726:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
727:   PetscScalar     x1,x2,x3,x4,x5,x6;
728:   MatScalar       *v;
729:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

732:   VecGetArray(xx,&x);
733:   VecGetArray(yy,&y);
734:   if (zz != yy) {
735:     VecGetArray(zz,&z);
736:   } else {
737:     z = y;
738:   }

740:   idx   = a->j;
741:   v     = a->a;
742:   ii    = a->i;

744:   for (i=0; i<mbs; i++) {
745:     n  = ii[1] - ii[0]; ii++;
746:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
747:     for (j=0; j<n; j++) {
748:       xb = x + 6*(*idx++);
749:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
750:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
751:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
752:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
753:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
754:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
755:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
756:       v += 36;
757:     }
758:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
759:     z += 6; y += 6;
760:   }
761:   VecRestoreArray(xx,&x);
762:   VecRestoreArray(yy,&y);
763:   if (zz != yy) {
764:     VecRestoreArray(zz,&z);
765:   }
766:   PetscLogFlops(72*a->nz);
767:   return(0);
768: }

770: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
771: {
772:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
773:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
774:   PetscScalar     x1,x2,x3,x4,x5,x6,x7;
775:   MatScalar       *v;
776:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

779:   VecGetArray(xx,&x);
780:   VecGetArray(yy,&y);
781:   if (zz != yy) {
782:     VecGetArray(zz,&z);
783:   } else {
784:     z = y;
785:   }

787:   idx   = a->j;
788:   v     = a->a;
789:   ii    = a->i;

791:   for (i=0; i<mbs; i++) {
792:     n  = ii[1] - ii[0]; ii++;
793:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
794:     for (j=0; j<n; j++) {
795:       xb = x + 7*(*idx++);
796:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
797:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
798:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
799:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
800:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
801:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
802:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
803:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
804:       v += 49;
805:     }
806:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
807:     z += 7; y += 7;
808:   }
809:   VecRestoreArray(xx,&x);
810:   VecRestoreArray(yy,&y);
811:   if (zz != yy) {
812:     VecRestoreArray(zz,&z);
813:   }
814:   PetscLogFlops(98*a->nz);
815:   return(0);
816: }

818: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
819: {
820:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
821:   PetscScalar    *x,*z,*xb,*work,*workt,*y;
822:   MatScalar      *v;
823:   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
824:   int            ncols,k;

827:   VecGetArray(xx,&x);
828:   VecGetArray(zz,&z);
829:   if (zz != yy) {
830:     VecGetArrayFast(yy,&y);
831:     PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
832:     VecRestoreArrayFast(yy,&y);
833:   }

835:   idx   = a->j;
836:   v     = a->a;
837:   ii    = a->i;


840:   if (!a->mult_work) {
841:     k    = PetscMax(A->m,A->n);
842:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
843:   }
844:   work = a->mult_work;
845:   for (i=0; i<mbs; i++) {
846:     n     = ii[1] - ii[0]; ii++;
847:     ncols = n*bs;
848:     workt = work;
849:     for (j=0; j<n; j++) {
850:       xb = x + bs*(*idx++);
851:       for (k=0; k<bs; k++) workt[k] = xb[k];
852:       workt += bs;
853:     }
854:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
855:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
856:     v += n*bs2;
857:     z += bs;
858:   }
859:   VecRestoreArray(xx,&x);
860:   VecRestoreArray(zz,&z);
861:   PetscLogFlops(2*a->nz*bs2);
862:   return(0);
863: }

865: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
866: {
867:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
868:   PetscScalar     *xg,*zg,*zb,zero = 0.0;
869:   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
870:   MatScalar       *v;
871:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
872:   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

875:   VecSet(&zero,zz);
876:   VecGetArray(xx,&xg); x = xg;
877:   VecGetArray(zz,&zg); z = zg;

879:   idx   = a->j;
880:   v     = a->a;
881:   ii    = a->i;
882:   xb    = x;
883:   switch (bs) {
884:   case 1:
885:     for (i=0; i<mbs; i++) {
886:       n  = ii[1] - ii[0]; ii++;
887:       x1 = xb[0];
888:       ib = idx + ai[i];
889:       for (j=0; j<n; j++) {
890:         rval    = ib[j];
891:         z[rval] += *v * x1;
892:         v++;
893:       }
894:       xb++;
895:     }
896:     break;
897:   case 2:
898:     for (i=0; i<mbs; i++) {
899:       n  = ii[1] - ii[0]; ii++;
900:       x1 = xb[0]; x2 = xb[1];
901:       ib = idx + ai[i];
902:       for (j=0; j<n; j++) {
903:         rval      = ib[j]*2;
904:         z[rval++] += v[0]*x1 + v[1]*x2;
905:         z[rval]   += v[2]*x1 + v[3]*x2;
906:         v  += 4;
907:       }
908:       xb += 2;
909:     }
910:     break;
911:   case 3:
912:     for (i=0; i<mbs; i++) {
913:       n  = ii[1] - ii[0]; ii++;
914:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
915:       ib = idx + ai[i];
916:       for (j=0; j<n; j++) {
917:         rval      = ib[j]*3;
918:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
919:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
920:         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
921:         v  += 9;
922:       }
923:       xb += 3;
924:     }
925:     break;
926:   case 4:
927:     for (i=0; i<mbs; i++) {
928:       n  = ii[1] - ii[0]; ii++;
929:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
930:       ib = idx + ai[i];
931:       for (j=0; j<n; j++) {
932:         rval      = ib[j]*4;
933:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
934:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
935:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
936:         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
937:         v  += 16;
938:       }
939:       xb += 4;
940:     }
941:     break;
942:   case 5:
943:     for (i=0; i<mbs; i++) {
944:       n  = ii[1] - ii[0]; ii++;
945:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
946:       x4 = xb[3]; x5 = xb[4];
947:       ib = idx + ai[i];
948:       for (j=0; j<n; j++) {
949:         rval      = ib[j]*5;
950:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
951:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
952:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
953:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
954:         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
955:         v  += 25;
956:       }
957:       xb += 5;
958:     }
959:     break;
960:   case 6:
961:     for (i=0; i<mbs; i++) {
962:       n  = ii[1] - ii[0]; ii++;
963:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
964:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
965:       ib = idx + ai[i];
966:       for (j=0; j<n; j++) {
967:         rval      = ib[j]*6;
968:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
969:         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
970:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
971:         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
972:         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
973:         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
974:         v  += 36;
975:       }
976:       xb += 6;
977:     }
978:     break;
979:   case 7:
980:     for (i=0; i<mbs; i++) {
981:       n  = ii[1] - ii[0]; ii++;
982:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
983:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
984:       ib = idx + ai[i];
985:       for (j=0; j<n; j++) {
986:         rval      = ib[j]*7;
987:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
988:         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
989:         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
990:         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
991:         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
992:         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
993:         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
994:         v  += 49;
995:       }
996:       xb += 7;
997:     }
998:     break;
999:   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
1000:       int          ncols,k;
1001:       PetscScalar  *work,*workt;

1003:       if (!a->mult_work) {
1004:         k = PetscMax(A->m,A->n);
1005:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1006:       }
1007:       work = a->mult_work;
1008:       for (i=0; i<mbs; i++) {
1009:         n     = ii[1] - ii[0]; ii++;
1010:         ncols = n*bs;
1011:         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));
1012:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1013:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1014:         v += n*bs2;
1015:         x += bs;
1016:         workt = work;
1017:         for (j=0; j<n; j++) {
1018:           zb = z + bs*(*idx++);
1019:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1020:           workt += bs;
1021:         }
1022:       }
1023:     }
1024:   }
1025:   VecRestoreArray(xx,&xg);
1026:   VecRestoreArray(zz,&zg);
1027:   PetscLogFlops(2*a->nz*a->bs2 - A->n);
1028:   return(0);
1029: }

1031: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1033: {
1034:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1035:   PetscScalar     *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1036:   MatScalar       *v;
1037:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

1040:   if (yy != zz) { VecCopy(yy,zz); }
1041:   VecGetArray(xx,&xg); x = xg;
1042:   VecGetArray(zz,&zg); z = zg;


1045:   idx   = a->j;
1046:   v     = a->a;
1047:   ii    = a->i;
1048:   xb    = x;

1050:   switch (bs) {
1051:   case 1:
1052:     for (i=0; i<mbs; i++) {
1053:       n  = ii[1] - ii[0]; ii++;
1054:       x1 = xb[0];
1055:       ib = idx + ai[i];
1056:       for (j=0; j<n; j++) {
1057:         rval    = ib[j];
1058:         z[rval] += *v * x1;
1059:         v++;
1060:       }
1061:       xb++;
1062:     }
1063:     break;
1064:   case 2:
1065:     for (i=0; i<mbs; i++) {
1066:       n  = ii[1] - ii[0]; ii++;
1067:       x1 = xb[0]; x2 = xb[1];
1068:       ib = idx + ai[i];
1069:       for (j=0; j<n; j++) {
1070:         rval      = ib[j]*2;
1071:         z[rval++] += v[0]*x1 + v[1]*x2;
1072:         z[rval++] += v[2]*x1 + v[3]*x2;
1073:         v  += 4;
1074:       }
1075:       xb += 2;
1076:     }
1077:     break;
1078:   case 3:
1079:     for (i=0; i<mbs; i++) {
1080:       n  = ii[1] - ii[0]; ii++;
1081:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1082:       ib = idx + ai[i];
1083:       for (j=0; j<n; j++) {
1084:         rval      = ib[j]*3;
1085:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1086:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1087:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1088:         v  += 9;
1089:       }
1090:       xb += 3;
1091:     }
1092:     break;
1093:   case 4:
1094:     for (i=0; i<mbs; i++) {
1095:       n  = ii[1] - ii[0]; ii++;
1096:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1097:       ib = idx + ai[i];
1098:       for (j=0; j<n; j++) {
1099:         rval      = ib[j]*4;
1100:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1101:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1102:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1103:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1104:         v  += 16;
1105:       }
1106:       xb += 4;
1107:     }
1108:     break;
1109:   case 5:
1110:     for (i=0; i<mbs; i++) {
1111:       n  = ii[1] - ii[0]; ii++;
1112:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1113:       x4 = xb[3]; x5 = xb[4];
1114:       ib = idx + ai[i];
1115:       for (j=0; j<n; j++) {
1116:         rval      = ib[j]*5;
1117:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1118:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1119:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1120:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1121:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1122:         v  += 25;
1123:       }
1124:       xb += 5;
1125:     }
1126:     break;
1127:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1128:       int          ncols,k;
1129:       PetscScalar  *work,*workt;

1131:       if (!a->mult_work) {
1132:         k = PetscMax(A->m,A->n);
1133:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1134:       }
1135:       work = a->mult_work;
1136:       for (i=0; i<mbs; i++) {
1137:         n     = ii[1] - ii[0]; ii++;
1138:         ncols = n*bs;
1139:         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));
1140:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1141:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1142:         v += n*bs2;
1143:         x += bs;
1144:         workt = work;
1145:         for (j=0; j<n; j++) {
1146:           zb = z + bs*(*idx++);
1147:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1148:           workt += bs;
1149:         }
1150:       }
1151:     }
1152:   }
1153:   VecRestoreArray(xx,&xg);
1154:   VecRestoreArray(zz,&zg);
1155:   PetscLogFlops(2*a->nz*a->bs2);
1156:   return(0);
1157: }

1159: int MatScale_SeqBAIJ(PetscScalar *alpha,Mat inA)
1160: {
1161:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1162:   int         totalnz = a->bs2*a->nz;
1163: #if defined(PETSC_USE_MAT_SINGLE)
1164:   int         i;
1165: #else
1166:   int         one = 1;
1167: #endif

1170: #if defined(PETSC_USE_MAT_SINGLE)
1171:   for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1172: #else
1173:   BLscal_(&totalnz,alpha,a->a,&one);
1174: #endif
1175:   PetscLogFlops(totalnz);
1176:   return(0);
1177: }

1179: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1180: {
1181:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1182:   MatScalar   *v = a->a;
1183:   PetscReal   sum = 0.0;
1184:   int         i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;

1187:   if (type == NORM_FROBENIUS) {
1188:     for (i=0; i< bs2*nz; i++) {
1189: #if defined(PETSC_USE_COMPLEX)
1190:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1191: #else
1192:       sum += (*v)*(*v); v++;
1193: #endif
1194:     }
1195:     *norm = sqrt(sum);
1196:   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1197:     *norm = 0.0;
1198:     for (k=0; k<bs; k++) {
1199:       for (j=0; j<a->mbs; j++) {
1200:         v = a->a + bs2*a->i[j] + k;
1201:         sum = 0.0;
1202:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1203:           for (k1=0; k1<bs; k1++){
1204:             sum += PetscAbsScalar(*v);
1205:             v   += bs;
1206:           }
1207:         }
1208:         if (sum > *norm) *norm = sum;
1209:       }
1210:     }
1211:   } else {
1212:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1213:   }
1214:   return(0);
1215: }


1218: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1219: {
1220:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1221:   int         ierr;
1222:   PetscTruth  flag;

1225:   PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1226:   if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");

1228:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1229:   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1230:     *flg = PETSC_FALSE;
1231:     return(0);
1232:   }
1233: 
1234:   /* if the a->i are the same */
1235:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1236:   if (*flg == PETSC_FALSE) {
1237:     return(0);
1238:   }
1239: 
1240:   /* if a->j are the same */
1241:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1242:   if (*flg == PETSC_FALSE) {
1243:     return(0);
1244:   }
1245:   /* if a->a are the same */
1246:   PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1247:   return(0);
1248: 
1249: }

1251: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1252: {
1253:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
1254:   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1255:   PetscScalar  *x,zero = 0.0;
1256:   MatScalar    *aa,*aa_j;

1259:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1260:   bs   = a->bs;
1261:   aa   = a->a;
1262:   ai   = a->i;
1263:   aj   = a->j;
1264:   ambs = a->mbs;
1265:   bs2  = a->bs2;

1267:   VecSet(&zero,v);
1268:   VecGetArray(v,&x);
1269:   VecGetLocalSize(v,&n);
1270:   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1271:   for (i=0; i<ambs; i++) {
1272:     for (j=ai[i]; j<ai[i+1]; j++) {
1273:       if (aj[j] == i) {
1274:         row  = i*bs;
1275:         aa_j = aa+j*bs2;
1276:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1277:         break;
1278:       }
1279:     }
1280:   }
1281:   VecRestoreArray(v,&x);
1282:   return(0);
1283: }

1285: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1286: {
1287:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
1288:   PetscScalar  *l,*r,x,*li,*ri;
1289:   MatScalar    *aa,*v;
1290:   int          ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1293:   ai  = a->i;
1294:   aj  = a->j;
1295:   aa  = a->a;
1296:   m   = A->m;
1297:   n   = A->n;
1298:   bs  = a->bs;
1299:   mbs = a->mbs;
1300:   bs2 = a->bs2;
1301:   if (ll) {
1302:     VecGetArray(ll,&l);
1303:     VecGetLocalSize(ll,&lm);
1304:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1305:     for (i=0; i<mbs; i++) { /* for each block row */
1306:       M  = ai[i+1] - ai[i];
1307:       li = l + i*bs;
1308:       v  = aa + bs2*ai[i];
1309:       for (j=0; j<M; j++) { /* for each block */
1310:         for (k=0; k<bs2; k++) {
1311:           (*v++) *= li[k%bs];
1312:         }
1313:       }
1314:     }
1315:     VecRestoreArray(ll,&l);
1316:     PetscLogFlops(a->nz);
1317:   }
1318: 
1319:   if (rr) {
1320:     VecGetArray(rr,&r);
1321:     VecGetLocalSize(rr,&rn);
1322:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1323:     for (i=0; i<mbs; i++) { /* for each block row */
1324:       M  = ai[i+1] - ai[i];
1325:       v  = aa + bs2*ai[i];
1326:       for (j=0; j<M; j++) { /* for each block */
1327:         ri = r + bs*aj[ai[i]+j];
1328:         for (k=0; k<bs; k++) {
1329:           x = ri[k];
1330:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1331:         }
1332:       }
1333:     }
1334:     VecRestoreArray(rr,&r);
1335:     PetscLogFlops(a->nz);
1336:   }
1337:   return(0);
1338: }


1341: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1342: {
1343:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1346:   info->rows_global    = (double)A->m;
1347:   info->columns_global = (double)A->n;
1348:   info->rows_local     = (double)A->m;
1349:   info->columns_local  = (double)A->n;
1350:   info->block_size     = a->bs2;
1351:   info->nz_allocated   = a->maxnz;
1352:   info->nz_used        = a->bs2*a->nz;
1353:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1354:   info->assemblies   = A->num_ass;
1355:   info->mallocs      = a->reallocs;
1356:   info->memory       = A->mem;
1357:   if (A->factor) {
1358:     info->fill_ratio_given  = A->info.fill_ratio_given;
1359:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1360:     info->factor_mallocs    = A->info.factor_mallocs;
1361:   } else {
1362:     info->fill_ratio_given  = 0;
1363:     info->fill_ratio_needed = 0;
1364:     info->factor_mallocs    = 0;
1365:   }
1366:   return(0);
1367: }


1370: int MatZeroEntries_SeqBAIJ(Mat A)
1371: {
1372:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1373:   int         ierr;

1376:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1377:   return(0);
1378: }