Actual source code: baij2.c

  1: #define PETSCMAT_DLL

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/inline/spops.h
 5:  #include src/inline/ilu.h
 6:  #include petscbt.h

 10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
 15:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 16:   PetscBT        table;

 19:   m     = a->mbs;
 20:   ai    = a->i;
 21:   aj    = a->j;
 22:   bs    = A->bs;

 24:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 26:   PetscBTCreate(m,table);
 27:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 28:   PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);
 34: 
 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n ; ++j){
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 42:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 43:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(is[i]);
 47: 
 48:     k = 0;
 49:     for (j=0; j<ov; j++){ /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end ; l++){
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++)
 64:         nidx2[j*bs+k] = nidx[j]*bs+k;
 65:     }
 66:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 67:   }
 68:   PetscBTDestroy(table);
 69:   PetscFree(nidx);
 70:   PetscFree(nidx2);
 71:   return(0);
 72: }

 76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   PetscInt       *irow,*icol,nrows,ncols,*ssmap,bs=A->bs,bs2=a->bs2;
 83:   PetscInt       *aj = a->j,*ai = a->i;
 84:   MatScalar      *mat_a;
 85:   Mat            C;
 86:   PetscTruth     flag;

 89:   ISSorted(iscol,(PetscTruth*)&i);
 90:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 92:   ISGetIndices(isrow,&irow);
 93:   ISGetIndices(iscol,&icol);
 94:   ISGetLocalSize(isrow,&nrows);
 95:   ISGetLocalSize(iscol,&ncols);

 97:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
 98:   ssmap = smap;
 99:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
101:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102:   /* determine lens of each row */
103:   for (i=0; i<nrows; i++) {
104:     kstart  = ai[irow[i]];
105:     kend    = kstart + a->ilen[irow[i]];
106:     lens[i] = 0;
107:       for (k=kstart; k<kend; k++) {
108:         if (ssmap[aj[k]]) {
109:           lens[i]++;
110:         }
111:       }
112:     }
113:   /* Create and fill new matrix */
114:   if (scall == MAT_REUSE_MATRIX) {
115:     c = (Mat_SeqBAIJ *)((*B)->data);

117:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119:     if (!flag) {
120:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121:     }
122:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123:     C = *B;
124:   } else {
125:     MatCreate(A->comm,&C);
126:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127:     MatSetType(C,A->type_name);
128:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129:   }
130:   c = (Mat_SeqBAIJ *)(C->data);
131:   for (i=0; i<nrows; i++) {
132:     row    = irow[i];
133:     kstart = ai[row];
134:     kend   = kstart + a->ilen[row];
135:     mat_i  = c->i[i];
136:     mat_j  = c->j + mat_i;
137:     mat_a  = c->a + mat_i*bs2;
138:     mat_ilen = c->ilen + i;
139:     for (k=kstart; k<kend; k++) {
140:       if ((tcol=ssmap[a->j[k]])) {
141:         *mat_j++ = tcol - 1;
142:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143:         mat_a   += bs2;
144:         (*mat_ilen)++;
145:       }
146:     }
147:   }
148: 
149:   /* Free work space */
150:   ISRestoreIndices(iscol,&icol);
151:   PetscFree(smap);
152:   PetscFree(lens);
153:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155: 
156:   ISRestoreIndices(isrow,&irow);
157:   *B = C;
158:   return(0);
159: }

163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
166:   IS             is1,is2;
168:   PetscInt       *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->bs,count;

171:   ISGetIndices(isrow,&irow);
172:   ISGetIndices(iscol,&icol);
173:   ISGetLocalSize(isrow,&nrows);
174:   ISGetLocalSize(iscol,&ncols);
175: 
176:   /* Verify if the indices corespond to each element in a block 
177:    and form the IS with compressed IS */
178:   PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179:   iary = vary + a->mbs;
180:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182:   count = 0;
183:   for (i=0; i<a->mbs; i++) {
184:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185:     if (vary[i]==bs) iary[count++] = i;
186:   }
187:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188: 
189:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191:   count = 0;
192:   for (i=0; i<a->mbs; i++) {
193:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194:     if (vary[i]==bs) iary[count++] = i;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197:   ISRestoreIndices(isrow,&irow);
198:   ISRestoreIndices(iscol,&icol);
199:   PetscFree(vary);

201:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202:   ISDestroy(is1);
203:   ISDestroy(is2);
204:   return(0);
205: }

209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212:   PetscInt       i;

215:   if (scall == MAT_INITIAL_MATRIX) {
216:     PetscMalloc((n+1)*sizeof(Mat),B);
217:   }

219:   for (i=0; i<n; i++) {
220:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221:   }
222:   return(0);
223: }


226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229:  #include petscblaslapack.h

233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
236:   PetscScalar    *x,*z,sum;
237:   MatScalar      *v;
239:   PetscInt       mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
240:   PetscTruth     usecprow=a->compressedrow.use;

243:   VecGetArray(xx,&x);
244:   VecGetArray(zz,&z);

246:   idx = a->j;
247:   v   = a->a;
248:   if (usecprow){
249:     mbs  = a->compressedrow.nrows;
250:     ii   = a->compressedrow.i;
251:     ridx = a->compressedrow.rindex;
252:   } else {
253:     mbs = a->mbs;
254:     ii  = a->i;
255:   }

257:   for (i=0; i<mbs; i++) {
258:     n    = ii[1] - ii[0]; ii++;
259:     sum  = 0.0;
260:     while (n--) sum += *v++ * x[*idx++];
261:     if (usecprow){
262:       z[ridx[i]] = sum;
263:     } else {
264:       z[i] = sum;
265:     }
266:   }
267:   VecRestoreArray(xx,&x);
268:   VecRestoreArray(zz,&z);
269:   PetscLogFlops(2*a->nz - mbs);
270:   return(0);
271: }

275: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
276: {
277:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
278:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,*zarray;
279:   PetscScalar    x1,x2;
280:   MatScalar      *v;
282:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
283:   PetscTruth     usecprow=a->compressedrow.use;

286:   VecGetArray(xx,&x);
287:   VecGetArray(zz,&zarray);

289:   idx = a->j;
290:   v   = a->a;
291:   if (usecprow){
292:     mbs  = a->compressedrow.nrows;
293:     ii   = a->compressedrow.i;
294:     ridx = a->compressedrow.rindex;
295:   } else {
296:     mbs = a->mbs;
297:     ii  = a->i;
298:     z   = zarray;
299:   }

301:   for (i=0; i<mbs; i++) {
302:     n  = ii[1] - ii[0]; ii++;
303:     sum1 = 0.0; sum2 = 0.0;
304:     for (j=0; j<n; j++) {
305:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
306:       sum1 += v[0]*x1 + v[2]*x2;
307:       sum2 += v[1]*x1 + v[3]*x2;
308:       v += 4;
309:     }
310:     if (usecprow) z = zarray + 2*ridx[i];
311:     z[0] = sum1; z[1] = sum2;
312:     if (!usecprow) z += 2;
313:   }
314:   VecRestoreArray(xx,&x);
315:   VecRestoreArray(zz,&zarray);
316:   PetscLogFlops(8*a->nz - 2*mbs);
317:   return(0);
318: }

322: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
323: {
324:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
325:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*zarray;
326:   MatScalar      *v;
328:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
329:   PetscTruth     usecprow=a->compressedrow.use;
330: 

332: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
333: #pragma disjoint(*v,*z,*xb)
334: #endif

337:   VecGetArray(xx,&x);
338:   VecGetArray(zz,&zarray);

340:   idx = a->j;
341:   v   = a->a;
342:   if (usecprow){
343:     mbs  = a->compressedrow.nrows;
344:     ii   = a->compressedrow.i;
345:     ridx = a->compressedrow.rindex;
346:   } else {
347:     mbs = a->mbs;
348:     ii  = a->i;
349:     z   = zarray;
350:   }

352:   for (i=0; i<mbs; i++) {
353:     n  = ii[1] - ii[0]; ii++;
354:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
355:     for (j=0; j<n; j++) {
356:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
357:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
358:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
359:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
360:       v += 9;
361:     }
362:     if (usecprow) z = zarray + 3*ridx[i];
363:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
364:     if (!usecprow) z += 3;
365:   }
366:   VecRestoreArray(xx,&x);
367:   VecRestoreArray(zz,&zarray);
368:   PetscLogFlops(18*a->nz - 3*mbs);
369:   return(0);
370: }

374: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
375: {
376:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
377:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
378:   MatScalar      *v;
380:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
381:   PetscTruth     usecprow=a->compressedrow.use;

384:   VecGetArray(xx,&x);
385:   VecGetArray(zz,&zarray);

387:   idx = a->j;
388:   v   = a->a;
389:   if (usecprow){
390:     mbs  = a->compressedrow.nrows;
391:     ii   = a->compressedrow.i;
392:     ridx = a->compressedrow.rindex;
393:   } else {
394:     mbs = a->mbs;
395:     ii  = a->i;
396:     z   = zarray;
397:   }

399:   for (i=0; i<mbs; i++) {
400:     n  = ii[1] - ii[0]; ii++;
401:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
402:     for (j=0; j<n; j++) {
403:       xb = x + 4*(*idx++);
404:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
405:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
406:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
407:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
408:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
409:       v += 16;
410:     }
411:     if (usecprow) z = zarray + 4*ridx[i];
412:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
413:     if (!usecprow) z += 4;
414:   }
415:   VecRestoreArray(xx,&x);
416:   VecRestoreArray(zz,&zarray);
417:   PetscLogFlops(32*a->nz - 4*mbs);
418:   return(0);
419: }

423: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
424: {
425:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
426:   PetscScalar    sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z = 0,*x,*zarray;
427:   MatScalar      *v;
429:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
430:   PetscTruth     usecprow=a->compressedrow.use;

433:   VecGetArray(xx,&x);
434:   VecGetArray(zz,&zarray);

436:   idx = a->j;
437:   v   = a->a;
438:   if (usecprow){
439:     mbs  = a->compressedrow.nrows;
440:     ii   = a->compressedrow.i;
441:     ridx = a->compressedrow.rindex;
442:   } else {
443:     mbs = a->mbs;
444:     ii  = a->i;
445:     z   = zarray;
446:   }

448:   for (i=0; i<mbs; i++) {
449:     n  = ii[1] - ii[0]; ii++;
450:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
451:     for (j=0; j<n; j++) {
452:       xb = x + 5*(*idx++);
453:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
454:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
455:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
456:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
457:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
458:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
459:       v += 25;
460:     }
461:     if (usecprow) z = zarray + 5*ridx[i];
462:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
463:     if (!usecprow) z += 5;
464:   }
465:   VecRestoreArray(xx,&x);
466:   VecRestoreArray(zz,&zarray);
467:   PetscLogFlops(50*a->nz - 5*mbs);
468:   return(0);
469: }


474: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
475: {
476:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
477:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
478:   PetscScalar    x1,x2,x3,x4,x5,x6,*zarray;
479:   MatScalar      *v;
481:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
482:   PetscTruth     usecprow=a->compressedrow.use;

485:   VecGetArray(xx,&x);
486:   VecGetArray(zz,&zarray);

488:   idx = a->j;
489:   v   = a->a;
490:   if (usecprow){
491:     mbs  = a->compressedrow.nrows;
492:     ii   = a->compressedrow.i;
493:     ridx = a->compressedrow.rindex;
494:   } else {
495:     mbs = a->mbs;
496:     ii  = a->i;
497:     z   = zarray;
498:   }

500:   for (i=0; i<mbs; i++) {
501:     n  = ii[1] - ii[0]; ii++;
502:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
503:     for (j=0; j<n; j++) {
504:       xb = x + 6*(*idx++);
505:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
506:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
507:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
508:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
509:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
510:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
511:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
512:       v += 36;
513:     }
514:     if (usecprow) z = zarray + 6*ridx[i];
515:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
516:     if (!usecprow) z += 6;
517:   }

519:   VecRestoreArray(xx,&x);
520:   VecRestoreArray(zz,&zarray);
521:   PetscLogFlops(72*a->nz - 6*mbs);
522:   return(0);
523: }
526: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
527: {
528:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
529:   PetscScalar    *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
530:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*zarray;
531:   MatScalar      *v;
533:   PetscInt       mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
534:   PetscTruth     usecprow=a->compressedrow.use;

537:   VecGetArray(xx,&x);
538:   VecGetArray(zz,&zarray);

540:   idx = a->j;
541:   v   = a->a;
542:   if (usecprow){
543:     mbs    = a->compressedrow.nrows;
544:     ii     = a->compressedrow.i;
545:     ridx = a->compressedrow.rindex;
546:   } else {
547:     mbs = a->mbs;
548:     ii  = a->i;
549:     z   = zarray;
550:   }

552:   for (i=0; i<mbs; i++) {
553:     n  = ii[1] - ii[0]; ii++;
554:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
555:     for (j=0; j<n; j++) {
556:       xb = x + 7*(*idx++);
557:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
558:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
559:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
560:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
561:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
562:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
563:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
564:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
565:       v += 49;
566:     }
567:     if (usecprow) z = zarray + 7*ridx[i];
568:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
569:     if (!usecprow) z += 7;
570:   }

572:   VecRestoreArray(xx,&x);
573:   VecRestoreArray(zz,&zarray);
574:   PetscLogFlops(98*a->nz - 7*mbs);
575:   return(0);
576: }

578: /*
579:     This will not work with MatScalar == float because it calls the BLAS
580: */
583: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
584: {
585:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
586:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
587:   MatScalar      *v;
589:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
590:   PetscInt       ncols,k,*ridx=PETSC_NULL;
591:   PetscTruth     usecprow=a->compressedrow.use;

594:   VecGetArray(xx,&x);
595:   VecGetArray(zz,&zarray);

597:   idx = a->j;
598:   v   = a->a;
599:   if (usecprow){
600:     mbs  = a->compressedrow.nrows;
601:     ii   = a->compressedrow.i;
602:     ridx = a->compressedrow.rindex;
603:   } else {
604:     mbs = a->mbs;
605:     ii  = a->i;
606:     z   = zarray;
607:   }

609:   if (!a->mult_work) {
610:     k    = PetscMax(A->m,A->n);
611:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
612:   }
613:   work = a->mult_work;
614:   for (i=0; i<mbs; i++) {
615:     n     = ii[1] - ii[0]; ii++;
616:     ncols = n*bs;
617:     workt = work;
618:     for (j=0; j<n; j++) {
619:       xb = x + bs*(*idx++);
620:       for (k=0; k<bs; k++) workt[k] = xb[k];
621:       workt += bs;
622:     }
623:     if (usecprow) z = zarray + bs*ridx[i];
624:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
625:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
626:     v += n*bs2;
627:     if (!usecprow) z += bs;
628:   }
629:   VecRestoreArray(xx,&x);
630:   VecRestoreArray(zz,&zarray);
631:   PetscLogFlops(2*a->nz*bs2 - bs*mbs);
632:   return(0);
633: }

637: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
638: {
639:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
640:   PetscScalar    *x,*y = 0,*z = 0,sum,*yarray,*zarray;
641:   MatScalar      *v;
643:   PetscInt       mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
644:   PetscTruth     usecprow=a->compressedrow.use;

647:   VecGetArray(xx,&x);
648:   VecGetArray(yy,&yarray);
649:   if (zz != yy) {
650:     VecGetArray(zz,&zarray);
651:   } else {
652:     zarray = yarray;
653:   }

655:   idx = a->j;
656:   v   = a->a;
657:   if (usecprow){
658:     if (zz != yy){
659:       PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
660:     }
661:     mbs  = a->compressedrow.nrows;
662:     ii   = a->compressedrow.i;
663:     ridx = a->compressedrow.rindex;
664:   } else {
665:     ii  = a->i;
666:     y   = yarray;
667:     z   = zarray;
668:   }

670:   for (i=0; i<mbs; i++) {
671:     n    = ii[1] - ii[0]; ii++;
672:     if (usecprow){
673:       z = zarray + ridx[i];
674:       y = yarray + ridx[i];
675:     }
676:     sum = y[0];
677:     while (n--) sum += *v++ * x[*idx++];
678:     z[0] = sum;
679:     if (!usecprow){
680:       z++; y++;
681:     }
682:   }
683:   VecRestoreArray(xx,&x);
684:   VecRestoreArray(yy,&yarray);
685:   if (zz != yy) {
686:     VecRestoreArray(zz,&zarray);
687:   }
688:   PetscLogFlops(2*a->nz);
689:   return(0);
690: }

694: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
695: {
696:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
697:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
698:   PetscScalar    x1,x2,*yarray,*zarray;
699:   MatScalar      *v;
701:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
702:   PetscTruth     usecprow=a->compressedrow.use;

705:   VecGetArray(xx,&x);
706:   VecGetArray(yy,&yarray);
707:   if (zz != yy) {
708:     VecGetArray(zz,&zarray);
709:   } else {
710:     zarray = yarray;
711:   }

713:   idx = a->j;
714:   v   = a->a;
715:   if (usecprow){
716:     if (zz != yy){
717:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
718:     }
719:     mbs  = a->compressedrow.nrows;
720:     ii   = a->compressedrow.i;
721:     ridx = a->compressedrow.rindex;
722:     if (zz != yy){
723:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
724:     }
725:   } else {
726:     ii  = a->i;
727:     y   = yarray;
728:     z   = zarray;
729:   }

731:   for (i=0; i<mbs; i++) {
732:     n  = ii[1] - ii[0]; ii++;
733:     if (usecprow){
734:       z = zarray + 2*ridx[i];
735:       y = yarray + 2*ridx[i];
736:     }
737:     sum1 = y[0]; sum2 = y[1];
738:     for (j=0; j<n; j++) {
739:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
740:       sum1 += v[0]*x1 + v[2]*x2;
741:       sum2 += v[1]*x1 + v[3]*x2;
742:       v += 4;
743:     }
744:     z[0] = sum1; z[1] = sum2;
745:     if (!usecprow){
746:       z += 2; y += 2;
747:     }
748:   }
749:   VecRestoreArray(xx,&x);
750:   VecRestoreArray(yy,&yarray);
751:   if (zz != yy) {
752:     VecRestoreArray(zz,&zarray);
753:   }
754:   PetscLogFlops(4*a->nz);
755:   return(0);
756: }

760: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
761: {
762:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
763:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
764:   MatScalar      *v;
766:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
767:   PetscTruth     usecprow=a->compressedrow.use;

770:   VecGetArray(xx,&x);
771:   VecGetArray(yy,&yarray);
772:   if (zz != yy) {
773:     VecGetArray(zz,&zarray);
774:   } else {
775:     zarray = yarray;
776:   }

778:   idx = a->j;
779:   v   = a->a;
780:   if (usecprow){
781:     if (zz != yy){
782:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
783:     }
784:     mbs  = a->compressedrow.nrows;
785:     ii   = a->compressedrow.i;
786:     ridx = a->compressedrow.rindex;
787:   } else {
788:     ii  = a->i;
789:     y   = yarray;
790:     z   = zarray;
791:   }

793:   for (i=0; i<mbs; i++) {
794:     n  = ii[1] - ii[0]; ii++;
795:     if (usecprow){
796:       z = zarray + 3*ridx[i];
797:       y = yarray + 3*ridx[i];
798:     }
799:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
800:     for (j=0; j<n; j++) {
801:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
802:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
803:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
804:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
805:       v += 9;
806:     }
807:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
808:     if (!usecprow){
809:       z += 3; y += 3;
810:     }
811:   }
812:   VecRestoreArray(xx,&x);
813:   VecRestoreArray(yy,&yarray);
814:   if (zz != yy) {
815:     VecRestoreArray(zz,&zarray);
816:   }
817:   PetscLogFlops(18*a->nz);
818:   return(0);
819: }

823: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
824: {
825:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
826:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
827:   MatScalar      *v;
829:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
830:   PetscTruth     usecprow=a->compressedrow.use;

833:   VecGetArray(xx,&x);
834:   VecGetArray(yy,&yarray);
835:   if (zz != yy) {
836:     VecGetArray(zz,&zarray);
837:   } else {
838:     zarray = yarray;
839:   }

841:   idx   = a->j;
842:   v     = a->a;
843:   if (usecprow){
844:     if (zz != yy){
845:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
846:     }
847:     mbs  = a->compressedrow.nrows;
848:     ii   = a->compressedrow.i;
849:     ridx = a->compressedrow.rindex;
850:   } else {
851:     ii  = a->i;
852:     y   = yarray;
853:     z   = zarray;
854:   }

856:   for (i=0; i<mbs; i++) {
857:     n  = ii[1] - ii[0]; ii++;
858:     if (usecprow){
859:       z = zarray + 4*ridx[i];
860:       y = yarray + 4*ridx[i];
861:     }
862:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
863:     for (j=0; j<n; j++) {
864:       xb = x + 4*(*idx++);
865:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
866:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
867:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
868:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
869:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
870:       v += 16;
871:     }
872:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
873:     if (!usecprow){
874:       z += 4; y += 4;
875:     }
876:   }
877:   VecRestoreArray(xx,&x);
878:   VecRestoreArray(yy,&yarray);
879:   if (zz != yy) {
880:     VecRestoreArray(zz,&zarray);
881:   }
882:   PetscLogFlops(32*a->nz);
883:   return(0);
884: }

888: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
889: {
890:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
891:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
892:   PetscScalar    *yarray,*zarray;
893:   MatScalar      *v;
895:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
896:   PetscTruth     usecprow=a->compressedrow.use;

899:   VecGetArray(xx,&x);
900:   VecGetArray(yy,&yarray);
901:   if (zz != yy) {
902:     VecGetArray(zz,&zarray);
903:   } else {
904:     zarray = yarray;
905:   }

907:   idx = a->j;
908:   v   = a->a;
909:   if (usecprow){
910:     if (zz != yy){
911:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
912:     }
913:     mbs  = a->compressedrow.nrows;
914:     ii   = a->compressedrow.i;
915:     ridx = a->compressedrow.rindex;
916:   } else {
917:     ii  = a->i;
918:     y   = yarray;
919:     z   = zarray;
920:   }

922:   for (i=0; i<mbs; i++) {
923:     n  = ii[1] - ii[0]; ii++;
924:     if (usecprow){
925:       z = zarray + 5*ridx[i];
926:       y = yarray + 5*ridx[i];
927:     }
928:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
929:     for (j=0; j<n; j++) {
930:       xb = x + 5*(*idx++);
931:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
932:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
933:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
934:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
935:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
936:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
937:       v += 25;
938:     }
939:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
940:     if (!usecprow){
941:       z += 5; y += 5;
942:     }
943:   }
944:   VecRestoreArray(xx,&x);
945:   VecRestoreArray(yy,&yarray);
946:   if (zz != yy) {
947:     VecRestoreArray(zz,&zarray);
948:   }
949:   PetscLogFlops(50*a->nz);
950:   return(0);
951: }
954: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
955: {
956:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
957:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
958:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
959:   MatScalar      *v;
961:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
962:   PetscTruth     usecprow=a->compressedrow.use;

965:   VecGetArray(xx,&x);
966:   VecGetArray(yy,&yarray);
967:   if (zz != yy) {
968:     VecGetArray(zz,&zarray);
969:   } else {
970:     zarray = yarray;
971:   }

973:   idx = a->j;
974:   v   = a->a;
975:   if (usecprow){
976:     if (zz != yy){
977:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
978:     }
979:     mbs  = a->compressedrow.nrows;
980:     ii   = a->compressedrow.i;
981:     ridx = a->compressedrow.rindex;
982:   } else {
983:     ii  = a->i;
984:     y   = yarray;
985:     z   = zarray;
986:   }

988:   for (i=0; i<mbs; i++) {
989:     n  = ii[1] - ii[0]; ii++;
990:     if (usecprow){
991:       z = zarray + 6*ridx[i];
992:       y = yarray + 6*ridx[i];
993:     }
994:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
995:     for (j=0; j<n; j++) {
996:       xb = x + 6*(*idx++);
997:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
998:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
999:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1000:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1001:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1002:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1003:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1004:       v += 36;
1005:     }
1006:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1007:     if (!usecprow){
1008:       z += 6; y += 6;
1009:     }
1010:   }
1011:   VecRestoreArray(xx,&x);
1012:   VecRestoreArray(yy,&yarray);
1013:   if (zz != yy) {
1014:     VecRestoreArray(zz,&zarray);
1015:   }
1016:   PetscLogFlops(72*a->nz);
1017:   return(0);
1018: }

1022: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1023: {
1024:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1025:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1026:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1027:   MatScalar      *v;
1029:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1030:   PetscTruth     usecprow=a->compressedrow.use;

1033:   VecGetArray(xx,&x);
1034:   VecGetArray(yy,&yarray);
1035:   if (zz != yy) {
1036:     VecGetArray(zz,&zarray);
1037:   } else {
1038:     zarray = yarray;
1039:   }

1041:   idx = a->j;
1042:   v   = a->a;
1043:   if (usecprow){
1044:     if (zz != yy){
1045:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1046:     }
1047:     mbs  = a->compressedrow.nrows;
1048:     ii   = a->compressedrow.i;
1049:     ridx = a->compressedrow.rindex;
1050:   } else {
1051:     ii  = a->i;
1052:     y   = yarray;
1053:     z   = zarray;
1054:   }

1056:   for (i=0; i<mbs; i++) {
1057:     n  = ii[1] - ii[0]; ii++;
1058:     if (usecprow){
1059:       z = zarray + 7*ridx[i];
1060:       y = yarray + 7*ridx[i];
1061:     }
1062:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1063:     for (j=0; j<n; j++) {
1064:       xb = x + 7*(*idx++);
1065:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1066:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1067:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1068:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1069:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1070:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1071:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1072:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1073:       v += 49;
1074:     }
1075:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1076:     if (!usecprow){
1077:       z += 7; y += 7;
1078:     }
1079:   }
1080:   VecRestoreArray(xx,&x);
1081:   VecRestoreArray(yy,&yarray);
1082:   if (zz != yy) {
1083:     VecRestoreArray(zz,&zarray);
1084:   }
1085:   PetscLogFlops(98*a->nz);
1086:   return(0);
1087: }

1091: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1092: {
1093:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1094:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*y,*zarray;
1095:   MatScalar      *v;
1097:   PetscInt       mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
1098:   PetscInt       ncols,k,*ridx=PETSC_NULL;
1099:   PetscTruth     usecprow=a->compressedrow.use;

1102:   VecGetArray(xx,&x);
1103:   VecGetArray(zz,&zarray);
1104:   if (zz != yy) {
1105:     VecGetArray(yy,&y);
1106:     PetscMemcpy(zarray,y,yy->n*sizeof(PetscScalar));
1107:     VecRestoreArray(yy,&y);
1108:   }

1110:   idx = a->j;
1111:   v   = a->a;
1112:   if (usecprow){
1113:     mbs    = a->compressedrow.nrows;
1114:     ii     = a->compressedrow.i;
1115:     ridx = a->compressedrow.rindex;
1116:   } else {
1117:     mbs = a->mbs;
1118:     ii  = a->i;
1119:     z   = zarray;
1120:   }

1122:   if (!a->mult_work) {
1123:     k    = PetscMax(A->m,A->n);
1124:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1125:   }
1126:   work = a->mult_work;
1127:   for (i=0; i<mbs; i++) {
1128:     n     = ii[1] - ii[0]; ii++;
1129:     ncols = n*bs;
1130:     workt = work;
1131:     for (j=0; j<n; j++) {
1132:       xb = x + bs*(*idx++);
1133:       for (k=0; k<bs; k++) workt[k] = xb[k];
1134:       workt += bs;
1135:     }
1136:     if (usecprow) z = zarray + bs*ridx[i];
1137:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1138:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1139:     v += n*bs2;
1140:     if (!usecprow){
1141:       z += bs;
1142:     }
1143:   }
1144:   VecRestoreArray(xx,&x);
1145:   VecRestoreArray(zz,&zarray);
1146:   PetscLogFlops(2*a->nz*bs2);
1147:   return(0);
1148: }

1152: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1153: {
1154:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1155:   PetscScalar    zero = 0.0;

1159:   VecSet(zz,zero);
1160:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);

1162:   PetscLogFlops(2*a->nz*a->bs2 - A->n);
1163:   return(0);
1164: }

1168: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1170: {
1171:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1172:   PetscScalar    *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1173:   MatScalar      *v;
1175:   PetscInt       mbs,i,*idx,*ii,rval,bs=A->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1176:   Mat_CompressedRow cprow = a->compressedrow;
1177:   PetscTruth        usecprow=cprow.use;

1180:   if (yy != zz) { VecCopy(yy,zz); }
1181:   VecGetArray(xx,&x);
1182:   VecGetArray(zz,&z);

1184:   idx = a->j;
1185:   v   = a->a;
1186:   if (usecprow){
1187:     mbs  = cprow.nrows;
1188:     ii   = cprow.i;
1189:     ridx = cprow.rindex;
1190:   } else {
1191:     mbs=a->mbs;
1192:     ii = a->i;
1193:     xb = x;
1194:   }

1196:   switch (bs) {
1197:   case 1:
1198:     for (i=0; i<mbs; i++) {
1199:       if (usecprow) xb = x + ridx[i];
1200:       x1 = xb[0];
1201:       ib = idx + ii[0];
1202:       n  = ii[1] - ii[0]; ii++;
1203:       for (j=0; j<n; j++) {
1204:         rval    = ib[j];
1205:         z[rval] += *v * x1;
1206:         v++;
1207:       }
1208:       if (!usecprow) xb++;
1209:     }
1210:     break;
1211:   case 2:
1212:     for (i=0; i<mbs; i++) {
1213:       if (usecprow) xb = x + 2*ridx[i];
1214:       x1 = xb[0]; x2 = xb[1];
1215:       ib = idx + ii[0];
1216:       n  = ii[1] - ii[0]; ii++;
1217:       for (j=0; j<n; j++) {
1218:         rval      = ib[j]*2;
1219:         z[rval++] += v[0]*x1 + v[1]*x2;
1220:         z[rval++] += v[2]*x1 + v[3]*x2;
1221:         v  += 4;
1222:       }
1223:       if (!usecprow) xb += 2;
1224:     }
1225:     break;
1226:   case 3:
1227:     for (i=0; i<mbs; i++) {
1228:       if (usecprow) xb = x + 3*ridx[i];
1229:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1230:       ib = idx + ii[0];
1231:       n  = ii[1] - ii[0]; ii++;
1232:       for (j=0; j<n; j++) {
1233:         rval      = ib[j]*3;
1234:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1235:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1236:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1237:         v  += 9;
1238:       }
1239:       if (!usecprow) xb += 3;
1240:     }
1241:     break;
1242:   case 4:
1243:     for (i=0; i<mbs; i++) {
1244:       if (usecprow) xb = x + 4*ridx[i];
1245:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1246:       ib = idx + ii[0];
1247:       n  = ii[1] - ii[0]; ii++;
1248:       for (j=0; j<n; j++) {
1249:         rval      = ib[j]*4;
1250:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1251:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1252:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1253:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1254:         v  += 16;
1255:       }
1256:       if (!usecprow) xb += 4;
1257:     }
1258:     break;
1259:   case 5:
1260:     for (i=0; i<mbs; i++) {
1261:       if (usecprow) xb = x + 5*ridx[i];
1262:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1263:       x4 = xb[3]; x5 = xb[4];
1264:       ib = idx + ii[0];
1265:       n  = ii[1] - ii[0]; ii++;
1266:       for (j=0; j<n; j++) {
1267:         rval      = ib[j]*5;
1268:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1269:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1270:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1271:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1272:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1273:         v  += 25;
1274:       }
1275:       if (!usecprow) xb += 5;
1276:     }
1277:     break;
1278:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1279:       PetscInt     ncols,k;
1280:       PetscScalar  *work,*workt,*xtmp;

1282:       if (!a->mult_work) {
1283:         k = PetscMax(A->m,A->n);
1284:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1285:       }
1286:       work = a->mult_work;
1287:       xtmp = x;
1288:       for (i=0; i<mbs; i++) {
1289:         n     = ii[1] - ii[0]; ii++;
1290:         ncols = n*bs;
1291:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1292:         if (usecprow) {
1293:           xtmp = x + bs*ridx[i];
1294:         }
1295:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1296:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1297:         v += n*bs2;
1298:         if (!usecprow) xtmp += bs;
1299:         workt = work;
1300:         for (j=0; j<n; j++) {
1301:           zb = z + bs*(*idx++);
1302:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1303:           workt += bs;
1304:         }
1305:       }
1306:     }
1307:   }
1308:   VecRestoreArray(xx,&x);
1309:   VecRestoreArray(zz,&z);
1310:   PetscLogFlops(2*a->nz*a->bs2);
1311:   return(0);
1312: }

1316: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1317: {
1318:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)inA->data;
1319:   PetscInt     totalnz = a->bs2*a->nz;
1320:   PetscScalar  oalpha = alpha;
1322: #if defined(PETSC_USE_MAT_SINGLE)
1323:   PetscInt     i;
1324: #else
1325:   PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1326: #endif

1329: #if defined(PETSC_USE_MAT_SINGLE)
1330:   for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1331: #else
1332:   BLASscal_(&tnz,&oalpha,a->a,&one);
1333: #endif
1334:   PetscLogFlops(totalnz);
1335:   return(0);
1336: }

1340: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1341: {
1343:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1344:   MatScalar      *v = a->a;
1345:   PetscReal      sum = 0.0;
1346:   PetscInt       i,j,k,bs=A->bs,nz=a->nz,bs2=a->bs2,k1;

1349:   if (type == NORM_FROBENIUS) {
1350:     for (i=0; i< bs2*nz; i++) {
1351: #if defined(PETSC_USE_COMPLEX)
1352:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1353: #else
1354:       sum += (*v)*(*v); v++;
1355: #endif
1356:     }
1357:     *norm = sqrt(sum);
1358:   } else if (type == NORM_1) { /* maximum column sum */
1359:     PetscReal *tmp;
1360:     PetscInt  *bcol = a->j;
1361:     PetscMalloc((A->n+1)*sizeof(PetscReal),&tmp);
1362:     PetscMemzero(tmp,A->n*sizeof(PetscReal));
1363:     for (i=0; i<nz; i++){
1364:       for (j=0; j<bs; j++){
1365:         k1 = bs*(*bcol) + j; /* column index */
1366:         for (k=0; k<bs; k++){
1367:           tmp[k1] += PetscAbsScalar(*v); v++;
1368:         }
1369:       }
1370:       bcol++;
1371:     }
1372:     *norm = 0.0;
1373:     for (j=0; j<A->n; j++) {
1374:       if (tmp[j] > *norm) *norm = tmp[j];
1375:     }
1376:     PetscFree(tmp);
1377:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1378:     *norm = 0.0;
1379:     for (k=0; k<bs; k++) {
1380:       for (j=0; j<a->mbs; j++) {
1381:         v = a->a + bs2*a->i[j] + k;
1382:         sum = 0.0;
1383:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1384:           for (k1=0; k1<bs; k1++){
1385:             sum += PetscAbsScalar(*v);
1386:             v   += bs;
1387:           }
1388:         }
1389:         if (sum > *norm) *norm = sum;
1390:       }
1391:     }
1392:   } else {
1393:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1394:   }
1395:   return(0);
1396: }


1401: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1402: {
1403:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1407:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1408:   if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1409:     *flg = PETSC_FALSE;
1410:     return(0);
1411:   }
1412: 
1413:   /* if the a->i are the same */
1414:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1415:   if (!*flg) {
1416:     return(0);
1417:   }
1418: 
1419:   /* if a->j are the same */
1420:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1421:   if (!*flg) {
1422:     return(0);
1423:   }
1424:   /* if a->a are the same */
1425:   PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(B->bs)*sizeof(PetscScalar),flg);
1426:   return(0);
1427: 
1428: }

1432: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1433: {
1434:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1436:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1437:   PetscScalar    *x,zero = 0.0;
1438:   MatScalar      *aa,*aa_j;

1441:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1442:   bs   = A->bs;
1443:   aa   = a->a;
1444:   ai   = a->i;
1445:   aj   = a->j;
1446:   ambs = a->mbs;
1447:   bs2  = a->bs2;

1449:   VecSet(v,zero);
1450:   VecGetArray(v,&x);
1451:   VecGetLocalSize(v,&n);
1452:   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1453:   for (i=0; i<ambs; i++) {
1454:     for (j=ai[i]; j<ai[i+1]; j++) {
1455:       if (aj[j] == i) {
1456:         row  = i*bs;
1457:         aa_j = aa+j*bs2;
1458:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1459:         break;
1460:       }
1461:     }
1462:   }
1463:   VecRestoreArray(v,&x);
1464:   return(0);
1465: }

1469: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1470: {
1471:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1472:   PetscScalar    *l,*r,x,*li,*ri;
1473:   MatScalar      *aa,*v;
1475:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1478:   ai  = a->i;
1479:   aj  = a->j;
1480:   aa  = a->a;
1481:   m   = A->m;
1482:   n   = A->n;
1483:   bs  = A->bs;
1484:   mbs = a->mbs;
1485:   bs2 = a->bs2;
1486:   if (ll) {
1487:     VecGetArray(ll,&l);
1488:     VecGetLocalSize(ll,&lm);
1489:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1490:     for (i=0; i<mbs; i++) { /* for each block row */
1491:       M  = ai[i+1] - ai[i];
1492:       li = l + i*bs;
1493:       v  = aa + bs2*ai[i];
1494:       for (j=0; j<M; j++) { /* for each block */
1495:         for (k=0; k<bs2; k++) {
1496:           (*v++) *= li[k%bs];
1497:         }
1498:       }
1499:     }
1500:     VecRestoreArray(ll,&l);
1501:     PetscLogFlops(a->nz);
1502:   }
1503: 
1504:   if (rr) {
1505:     VecGetArray(rr,&r);
1506:     VecGetLocalSize(rr,&rn);
1507:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1508:     for (i=0; i<mbs; i++) { /* for each block row */
1509:       M  = ai[i+1] - ai[i];
1510:       v  = aa + bs2*ai[i];
1511:       for (j=0; j<M; j++) { /* for each block */
1512:         ri = r + bs*aj[ai[i]+j];
1513:         for (k=0; k<bs; k++) {
1514:           x = ri[k];
1515:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1516:         }
1517:       }
1518:     }
1519:     VecRestoreArray(rr,&r);
1520:     PetscLogFlops(a->nz);
1521:   }
1522:   return(0);
1523: }


1528: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1529: {
1530:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1533:   info->rows_global    = (double)A->m;
1534:   info->columns_global = (double)A->n;
1535:   info->rows_local     = (double)A->m;
1536:   info->columns_local  = (double)A->n;
1537:   info->block_size     = a->bs2;
1538:   info->nz_allocated   = a->maxnz;
1539:   info->nz_used        = a->bs2*a->nz;
1540:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1541:   info->assemblies   = A->num_ass;
1542:   info->mallocs      = a->reallocs;
1543:   info->memory       = A->mem;
1544:   if (A->factor) {
1545:     info->fill_ratio_given  = A->info.fill_ratio_given;
1546:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1547:     info->factor_mallocs    = A->info.factor_mallocs;
1548:   } else {
1549:     info->fill_ratio_given  = 0;
1550:     info->fill_ratio_needed = 0;
1551:     info->factor_mallocs    = 0;
1552:   }
1553:   return(0);
1554: }


1559: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1560: {
1561:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1565:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1566:   return(0);
1567: }