Actual source code: baij2.c
1: #define PETSCMAT_DLL
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/inline/spops.h
5: #include src/inline/ilu.h
6: #include petscbt.h
10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: PetscInt row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->bs;
24: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,table);
27: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
28: PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
34:
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n ; ++j){
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(is[i]);
47:
48: k = 0;
49: for (j=0; j<ov; j++){ /* for each overlap*/
50: n = isz;
51: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end ; l++){
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++)
64: nidx2[j*bs+k] = nidx[j]*bs+k;
65: }
66: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
67: }
68: PetscBTDestroy(table);
69: PetscFree(nidx);
70: PetscFree(nidx2);
71: return(0);
72: }
76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: PetscInt *irow,*icol,nrows,ncols,*ssmap,bs=A->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscTruth flag;
89: ISSorted(iscol,(PetscTruth*)&i);
90: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92: ISGetIndices(isrow,&irow);
93: ISGetIndices(iscol,&icol);
94: ISGetLocalSize(isrow,&nrows);
95: ISGetLocalSize(iscol,&ncols);
97: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
98: ssmap = smap;
99: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100: PetscMemzero(smap,oldcols*sizeof(PetscInt));
101: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102: /* determine lens of each row */
103: for (i=0; i<nrows; i++) {
104: kstart = ai[irow[i]];
105: kend = kstart + a->ilen[irow[i]];
106: lens[i] = 0;
107: for (k=kstart; k<kend; k++) {
108: if (ssmap[aj[k]]) {
109: lens[i]++;
110: }
111: }
112: }
113: /* Create and fill new matrix */
114: if (scall == MAT_REUSE_MATRIX) {
115: c = (Mat_SeqBAIJ *)((*B)->data);
117: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119: if (!flag) {
120: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121: }
122: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123: C = *B;
124: } else {
125: MatCreate(A->comm,&C);
126: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127: MatSetType(C,A->type_name);
128: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129: }
130: c = (Mat_SeqBAIJ *)(C->data);
131: for (i=0; i<nrows; i++) {
132: row = irow[i];
133: kstart = ai[row];
134: kend = kstart + a->ilen[row];
135: mat_i = c->i[i];
136: mat_j = c->j + mat_i;
137: mat_a = c->a + mat_i*bs2;
138: mat_ilen = c->ilen + i;
139: for (k=kstart; k<kend; k++) {
140: if ((tcol=ssmap[a->j[k]])) {
141: *mat_j++ = tcol - 1;
142: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143: mat_a += bs2;
144: (*mat_ilen)++;
145: }
146: }
147: }
148:
149: /* Free work space */
150: ISRestoreIndices(iscol,&icol);
151: PetscFree(smap);
152: PetscFree(lens);
153: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155:
156: ISRestoreIndices(isrow,&irow);
157: *B = C;
158: return(0);
159: }
163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
166: IS is1,is2;
168: PetscInt *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->bs,count;
171: ISGetIndices(isrow,&irow);
172: ISGetIndices(iscol,&icol);
173: ISGetLocalSize(isrow,&nrows);
174: ISGetLocalSize(iscol,&ncols);
175:
176: /* Verify if the indices corespond to each element in a block
177: and form the IS with compressed IS */
178: PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179: iary = vary + a->mbs;
180: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182: count = 0;
183: for (i=0; i<a->mbs; i++) {
184: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185: if (vary[i]==bs) iary[count++] = i;
186: }
187: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188:
189: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191: count = 0;
192: for (i=0; i<a->mbs; i++) {
193: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194: if (vary[i]==bs) iary[count++] = i;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197: ISRestoreIndices(isrow,&irow);
198: ISRestoreIndices(iscol,&icol);
199: PetscFree(vary);
201: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202: ISDestroy(is1);
203: ISDestroy(is2);
204: return(0);
205: }
209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212: PetscInt i;
215: if (scall == MAT_INITIAL_MATRIX) {
216: PetscMalloc((n+1)*sizeof(Mat),B);
217: }
219: for (i=0; i<n; i++) {
220: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221: }
222: return(0);
223: }
226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229: #include petscblaslapack.h
233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
236: PetscScalar *x,*z,sum;
237: MatScalar *v;
239: PetscInt mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
240: PetscTruth usecprow=a->compressedrow.use;
243: VecGetArray(xx,&x);
244: VecGetArray(zz,&z);
246: idx = a->j;
247: v = a->a;
248: if (usecprow){
249: mbs = a->compressedrow.nrows;
250: ii = a->compressedrow.i;
251: ridx = a->compressedrow.rindex;
252: } else {
253: mbs = a->mbs;
254: ii = a->i;
255: }
257: for (i=0; i<mbs; i++) {
258: n = ii[1] - ii[0]; ii++;
259: sum = 0.0;
260: while (n--) sum += *v++ * x[*idx++];
261: if (usecprow){
262: z[ridx[i]] = sum;
263: } else {
264: z[i] = sum;
265: }
266: }
267: VecRestoreArray(xx,&x);
268: VecRestoreArray(zz,&z);
269: PetscLogFlops(2*a->nz - mbs);
270: return(0);
271: }
275: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
276: {
277: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
278: PetscScalar *x,*z = 0,*xb,sum1,sum2,*zarray;
279: PetscScalar x1,x2;
280: MatScalar *v;
282: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
283: PetscTruth usecprow=a->compressedrow.use;
286: VecGetArray(xx,&x);
287: VecGetArray(zz,&zarray);
289: idx = a->j;
290: v = a->a;
291: if (usecprow){
292: mbs = a->compressedrow.nrows;
293: ii = a->compressedrow.i;
294: ridx = a->compressedrow.rindex;
295: } else {
296: mbs = a->mbs;
297: ii = a->i;
298: z = zarray;
299: }
301: for (i=0; i<mbs; i++) {
302: n = ii[1] - ii[0]; ii++;
303: sum1 = 0.0; sum2 = 0.0;
304: for (j=0; j<n; j++) {
305: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
306: sum1 += v[0]*x1 + v[2]*x2;
307: sum2 += v[1]*x1 + v[3]*x2;
308: v += 4;
309: }
310: if (usecprow) z = zarray + 2*ridx[i];
311: z[0] = sum1; z[1] = sum2;
312: if (!usecprow) z += 2;
313: }
314: VecRestoreArray(xx,&x);
315: VecRestoreArray(zz,&zarray);
316: PetscLogFlops(8*a->nz - 2*mbs);
317: return(0);
318: }
322: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
323: {
324: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
325: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*zarray;
326: MatScalar *v;
328: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
329: PetscTruth usecprow=a->compressedrow.use;
330:
332: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
333: #pragma disjoint(*v,*z,*xb)
334: #endif
337: VecGetArray(xx,&x);
338: VecGetArray(zz,&zarray);
340: idx = a->j;
341: v = a->a;
342: if (usecprow){
343: mbs = a->compressedrow.nrows;
344: ii = a->compressedrow.i;
345: ridx = a->compressedrow.rindex;
346: } else {
347: mbs = a->mbs;
348: ii = a->i;
349: z = zarray;
350: }
352: for (i=0; i<mbs; i++) {
353: n = ii[1] - ii[0]; ii++;
354: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
355: for (j=0; j<n; j++) {
356: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
357: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
358: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
359: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
360: v += 9;
361: }
362: if (usecprow) z = zarray + 3*ridx[i];
363: z[0] = sum1; z[1] = sum2; z[2] = sum3;
364: if (!usecprow) z += 3;
365: }
366: VecRestoreArray(xx,&x);
367: VecRestoreArray(zz,&zarray);
368: PetscLogFlops(18*a->nz - 3*mbs);
369: return(0);
370: }
374: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
375: {
376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
377: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
378: MatScalar *v;
380: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
381: PetscTruth usecprow=a->compressedrow.use;
384: VecGetArray(xx,&x);
385: VecGetArray(zz,&zarray);
387: idx = a->j;
388: v = a->a;
389: if (usecprow){
390: mbs = a->compressedrow.nrows;
391: ii = a->compressedrow.i;
392: ridx = a->compressedrow.rindex;
393: } else {
394: mbs = a->mbs;
395: ii = a->i;
396: z = zarray;
397: }
399: for (i=0; i<mbs; i++) {
400: n = ii[1] - ii[0]; ii++;
401: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
402: for (j=0; j<n; j++) {
403: xb = x + 4*(*idx++);
404: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
405: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
406: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
407: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
408: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
409: v += 16;
410: }
411: if (usecprow) z = zarray + 4*ridx[i];
412: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
413: if (!usecprow) z += 4;
414: }
415: VecRestoreArray(xx,&x);
416: VecRestoreArray(zz,&zarray);
417: PetscLogFlops(32*a->nz - 4*mbs);
418: return(0);
419: }
423: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
424: {
425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
426: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z = 0,*x,*zarray;
427: MatScalar *v;
429: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
430: PetscTruth usecprow=a->compressedrow.use;
433: VecGetArray(xx,&x);
434: VecGetArray(zz,&zarray);
436: idx = a->j;
437: v = a->a;
438: if (usecprow){
439: mbs = a->compressedrow.nrows;
440: ii = a->compressedrow.i;
441: ridx = a->compressedrow.rindex;
442: } else {
443: mbs = a->mbs;
444: ii = a->i;
445: z = zarray;
446: }
448: for (i=0; i<mbs; i++) {
449: n = ii[1] - ii[0]; ii++;
450: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
451: for (j=0; j<n; j++) {
452: xb = x + 5*(*idx++);
453: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
454: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
455: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
456: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
457: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
458: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
459: v += 25;
460: }
461: if (usecprow) z = zarray + 5*ridx[i];
462: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
463: if (!usecprow) z += 5;
464: }
465: VecRestoreArray(xx,&x);
466: VecRestoreArray(zz,&zarray);
467: PetscLogFlops(50*a->nz - 5*mbs);
468: return(0);
469: }
474: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
475: {
476: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
477: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
478: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
479: MatScalar *v;
481: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
482: PetscTruth usecprow=a->compressedrow.use;
485: VecGetArray(xx,&x);
486: VecGetArray(zz,&zarray);
488: idx = a->j;
489: v = a->a;
490: if (usecprow){
491: mbs = a->compressedrow.nrows;
492: ii = a->compressedrow.i;
493: ridx = a->compressedrow.rindex;
494: } else {
495: mbs = a->mbs;
496: ii = a->i;
497: z = zarray;
498: }
500: for (i=0; i<mbs; i++) {
501: n = ii[1] - ii[0]; ii++;
502: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
503: for (j=0; j<n; j++) {
504: xb = x + 6*(*idx++);
505: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
506: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
507: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
508: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
509: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
510: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
511: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
512: v += 36;
513: }
514: if (usecprow) z = zarray + 6*ridx[i];
515: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
516: if (!usecprow) z += 6;
517: }
519: VecRestoreArray(xx,&x);
520: VecRestoreArray(zz,&zarray);
521: PetscLogFlops(72*a->nz - 6*mbs);
522: return(0);
523: }
526: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
527: {
528: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
529: PetscScalar *x,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
530: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
531: MatScalar *v;
533: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
534: PetscTruth usecprow=a->compressedrow.use;
537: VecGetArray(xx,&x);
538: VecGetArray(zz,&zarray);
540: idx = a->j;
541: v = a->a;
542: if (usecprow){
543: mbs = a->compressedrow.nrows;
544: ii = a->compressedrow.i;
545: ridx = a->compressedrow.rindex;
546: } else {
547: mbs = a->mbs;
548: ii = a->i;
549: z = zarray;
550: }
552: for (i=0; i<mbs; i++) {
553: n = ii[1] - ii[0]; ii++;
554: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
555: for (j=0; j<n; j++) {
556: xb = x + 7*(*idx++);
557: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
558: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
559: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
560: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
561: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
562: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
563: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
564: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
565: v += 49;
566: }
567: if (usecprow) z = zarray + 7*ridx[i];
568: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
569: if (!usecprow) z += 7;
570: }
572: VecRestoreArray(xx,&x);
573: VecRestoreArray(zz,&zarray);
574: PetscLogFlops(98*a->nz - 7*mbs);
575: return(0);
576: }
578: /*
579: This will not work with MatScalar == float because it calls the BLAS
580: */
583: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
584: {
585: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
586: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
587: MatScalar *v;
589: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
590: PetscInt ncols,k,*ridx=PETSC_NULL;
591: PetscTruth usecprow=a->compressedrow.use;
594: VecGetArray(xx,&x);
595: VecGetArray(zz,&zarray);
597: idx = a->j;
598: v = a->a;
599: if (usecprow){
600: mbs = a->compressedrow.nrows;
601: ii = a->compressedrow.i;
602: ridx = a->compressedrow.rindex;
603: } else {
604: mbs = a->mbs;
605: ii = a->i;
606: z = zarray;
607: }
609: if (!a->mult_work) {
610: k = PetscMax(A->m,A->n);
611: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
612: }
613: work = a->mult_work;
614: for (i=0; i<mbs; i++) {
615: n = ii[1] - ii[0]; ii++;
616: ncols = n*bs;
617: workt = work;
618: for (j=0; j<n; j++) {
619: xb = x + bs*(*idx++);
620: for (k=0; k<bs; k++) workt[k] = xb[k];
621: workt += bs;
622: }
623: if (usecprow) z = zarray + bs*ridx[i];
624: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
625: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
626: v += n*bs2;
627: if (!usecprow) z += bs;
628: }
629: VecRestoreArray(xx,&x);
630: VecRestoreArray(zz,&zarray);
631: PetscLogFlops(2*a->nz*bs2 - bs*mbs);
632: return(0);
633: }
637: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
638: {
639: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
640: PetscScalar *x,*y = 0,*z = 0,sum,*yarray,*zarray;
641: MatScalar *v;
643: PetscInt mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
644: PetscTruth usecprow=a->compressedrow.use;
647: VecGetArray(xx,&x);
648: VecGetArray(yy,&yarray);
649: if (zz != yy) {
650: VecGetArray(zz,&zarray);
651: } else {
652: zarray = yarray;
653: }
655: idx = a->j;
656: v = a->a;
657: if (usecprow){
658: if (zz != yy){
659: PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
660: }
661: mbs = a->compressedrow.nrows;
662: ii = a->compressedrow.i;
663: ridx = a->compressedrow.rindex;
664: } else {
665: ii = a->i;
666: y = yarray;
667: z = zarray;
668: }
670: for (i=0; i<mbs; i++) {
671: n = ii[1] - ii[0]; ii++;
672: if (usecprow){
673: z = zarray + ridx[i];
674: y = yarray + ridx[i];
675: }
676: sum = y[0];
677: while (n--) sum += *v++ * x[*idx++];
678: z[0] = sum;
679: if (!usecprow){
680: z++; y++;
681: }
682: }
683: VecRestoreArray(xx,&x);
684: VecRestoreArray(yy,&yarray);
685: if (zz != yy) {
686: VecRestoreArray(zz,&zarray);
687: }
688: PetscLogFlops(2*a->nz);
689: return(0);
690: }
694: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
695: {
696: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
697: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
698: PetscScalar x1,x2,*yarray,*zarray;
699: MatScalar *v;
701: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
702: PetscTruth usecprow=a->compressedrow.use;
705: VecGetArray(xx,&x);
706: VecGetArray(yy,&yarray);
707: if (zz != yy) {
708: VecGetArray(zz,&zarray);
709: } else {
710: zarray = yarray;
711: }
713: idx = a->j;
714: v = a->a;
715: if (usecprow){
716: if (zz != yy){
717: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
718: }
719: mbs = a->compressedrow.nrows;
720: ii = a->compressedrow.i;
721: ridx = a->compressedrow.rindex;
722: if (zz != yy){
723: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
724: }
725: } else {
726: ii = a->i;
727: y = yarray;
728: z = zarray;
729: }
731: for (i=0; i<mbs; i++) {
732: n = ii[1] - ii[0]; ii++;
733: if (usecprow){
734: z = zarray + 2*ridx[i];
735: y = yarray + 2*ridx[i];
736: }
737: sum1 = y[0]; sum2 = y[1];
738: for (j=0; j<n; j++) {
739: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
740: sum1 += v[0]*x1 + v[2]*x2;
741: sum2 += v[1]*x1 + v[3]*x2;
742: v += 4;
743: }
744: z[0] = sum1; z[1] = sum2;
745: if (!usecprow){
746: z += 2; y += 2;
747: }
748: }
749: VecRestoreArray(xx,&x);
750: VecRestoreArray(yy,&yarray);
751: if (zz != yy) {
752: VecRestoreArray(zz,&zarray);
753: }
754: PetscLogFlops(4*a->nz);
755: return(0);
756: }
760: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
761: {
762: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
763: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
764: MatScalar *v;
766: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
767: PetscTruth usecprow=a->compressedrow.use;
770: VecGetArray(xx,&x);
771: VecGetArray(yy,&yarray);
772: if (zz != yy) {
773: VecGetArray(zz,&zarray);
774: } else {
775: zarray = yarray;
776: }
778: idx = a->j;
779: v = a->a;
780: if (usecprow){
781: if (zz != yy){
782: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
783: }
784: mbs = a->compressedrow.nrows;
785: ii = a->compressedrow.i;
786: ridx = a->compressedrow.rindex;
787: } else {
788: ii = a->i;
789: y = yarray;
790: z = zarray;
791: }
793: for (i=0; i<mbs; i++) {
794: n = ii[1] - ii[0]; ii++;
795: if (usecprow){
796: z = zarray + 3*ridx[i];
797: y = yarray + 3*ridx[i];
798: }
799: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
800: for (j=0; j<n; j++) {
801: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
802: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
803: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
804: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
805: v += 9;
806: }
807: z[0] = sum1; z[1] = sum2; z[2] = sum3;
808: if (!usecprow){
809: z += 3; y += 3;
810: }
811: }
812: VecRestoreArray(xx,&x);
813: VecRestoreArray(yy,&yarray);
814: if (zz != yy) {
815: VecRestoreArray(zz,&zarray);
816: }
817: PetscLogFlops(18*a->nz);
818: return(0);
819: }
823: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
824: {
825: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
826: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
827: MatScalar *v;
829: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
830: PetscTruth usecprow=a->compressedrow.use;
833: VecGetArray(xx,&x);
834: VecGetArray(yy,&yarray);
835: if (zz != yy) {
836: VecGetArray(zz,&zarray);
837: } else {
838: zarray = yarray;
839: }
841: idx = a->j;
842: v = a->a;
843: if (usecprow){
844: if (zz != yy){
845: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
846: }
847: mbs = a->compressedrow.nrows;
848: ii = a->compressedrow.i;
849: ridx = a->compressedrow.rindex;
850: } else {
851: ii = a->i;
852: y = yarray;
853: z = zarray;
854: }
856: for (i=0; i<mbs; i++) {
857: n = ii[1] - ii[0]; ii++;
858: if (usecprow){
859: z = zarray + 4*ridx[i];
860: y = yarray + 4*ridx[i];
861: }
862: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
863: for (j=0; j<n; j++) {
864: xb = x + 4*(*idx++);
865: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
866: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
867: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
868: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
869: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
870: v += 16;
871: }
872: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
873: if (!usecprow){
874: z += 4; y += 4;
875: }
876: }
877: VecRestoreArray(xx,&x);
878: VecRestoreArray(yy,&yarray);
879: if (zz != yy) {
880: VecRestoreArray(zz,&zarray);
881: }
882: PetscLogFlops(32*a->nz);
883: return(0);
884: }
888: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
889: {
890: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
891: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
892: PetscScalar *yarray,*zarray;
893: MatScalar *v;
895: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
896: PetscTruth usecprow=a->compressedrow.use;
899: VecGetArray(xx,&x);
900: VecGetArray(yy,&yarray);
901: if (zz != yy) {
902: VecGetArray(zz,&zarray);
903: } else {
904: zarray = yarray;
905: }
907: idx = a->j;
908: v = a->a;
909: if (usecprow){
910: if (zz != yy){
911: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
912: }
913: mbs = a->compressedrow.nrows;
914: ii = a->compressedrow.i;
915: ridx = a->compressedrow.rindex;
916: } else {
917: ii = a->i;
918: y = yarray;
919: z = zarray;
920: }
922: for (i=0; i<mbs; i++) {
923: n = ii[1] - ii[0]; ii++;
924: if (usecprow){
925: z = zarray + 5*ridx[i];
926: y = yarray + 5*ridx[i];
927: }
928: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
929: for (j=0; j<n; j++) {
930: xb = x + 5*(*idx++);
931: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
932: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
933: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
934: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
935: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
936: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
937: v += 25;
938: }
939: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
940: if (!usecprow){
941: z += 5; y += 5;
942: }
943: }
944: VecRestoreArray(xx,&x);
945: VecRestoreArray(yy,&yarray);
946: if (zz != yy) {
947: VecRestoreArray(zz,&zarray);
948: }
949: PetscLogFlops(50*a->nz);
950: return(0);
951: }
954: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
955: {
956: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
957: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
958: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
959: MatScalar *v;
961: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
962: PetscTruth usecprow=a->compressedrow.use;
965: VecGetArray(xx,&x);
966: VecGetArray(yy,&yarray);
967: if (zz != yy) {
968: VecGetArray(zz,&zarray);
969: } else {
970: zarray = yarray;
971: }
973: idx = a->j;
974: v = a->a;
975: if (usecprow){
976: if (zz != yy){
977: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
978: }
979: mbs = a->compressedrow.nrows;
980: ii = a->compressedrow.i;
981: ridx = a->compressedrow.rindex;
982: } else {
983: ii = a->i;
984: y = yarray;
985: z = zarray;
986: }
988: for (i=0; i<mbs; i++) {
989: n = ii[1] - ii[0]; ii++;
990: if (usecprow){
991: z = zarray + 6*ridx[i];
992: y = yarray + 6*ridx[i];
993: }
994: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
995: for (j=0; j<n; j++) {
996: xb = x + 6*(*idx++);
997: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
998: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
999: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1000: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1001: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1002: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1003: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1004: v += 36;
1005: }
1006: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1007: if (!usecprow){
1008: z += 6; y += 6;
1009: }
1010: }
1011: VecRestoreArray(xx,&x);
1012: VecRestoreArray(yy,&yarray);
1013: if (zz != yy) {
1014: VecRestoreArray(zz,&zarray);
1015: }
1016: PetscLogFlops(72*a->nz);
1017: return(0);
1018: }
1022: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1023: {
1024: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1025: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1026: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1027: MatScalar *v;
1029: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1030: PetscTruth usecprow=a->compressedrow.use;
1033: VecGetArray(xx,&x);
1034: VecGetArray(yy,&yarray);
1035: if (zz != yy) {
1036: VecGetArray(zz,&zarray);
1037: } else {
1038: zarray = yarray;
1039: }
1041: idx = a->j;
1042: v = a->a;
1043: if (usecprow){
1044: if (zz != yy){
1045: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1046: }
1047: mbs = a->compressedrow.nrows;
1048: ii = a->compressedrow.i;
1049: ridx = a->compressedrow.rindex;
1050: } else {
1051: ii = a->i;
1052: y = yarray;
1053: z = zarray;
1054: }
1056: for (i=0; i<mbs; i++) {
1057: n = ii[1] - ii[0]; ii++;
1058: if (usecprow){
1059: z = zarray + 7*ridx[i];
1060: y = yarray + 7*ridx[i];
1061: }
1062: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1063: for (j=0; j<n; j++) {
1064: xb = x + 7*(*idx++);
1065: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1066: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1067: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1068: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1069: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1070: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1071: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1072: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1073: v += 49;
1074: }
1075: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1076: if (!usecprow){
1077: z += 7; y += 7;
1078: }
1079: }
1080: VecRestoreArray(xx,&x);
1081: VecRestoreArray(yy,&yarray);
1082: if (zz != yy) {
1083: VecRestoreArray(zz,&zarray);
1084: }
1085: PetscLogFlops(98*a->nz);
1086: return(0);
1087: }
1091: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1092: {
1093: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1094: PetscScalar *x,*z = 0,*xb,*work,*workt,*y,*zarray;
1095: MatScalar *v;
1097: PetscInt mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
1098: PetscInt ncols,k,*ridx=PETSC_NULL;
1099: PetscTruth usecprow=a->compressedrow.use;
1102: VecGetArray(xx,&x);
1103: VecGetArray(zz,&zarray);
1104: if (zz != yy) {
1105: VecGetArray(yy,&y);
1106: PetscMemcpy(zarray,y,yy->n*sizeof(PetscScalar));
1107: VecRestoreArray(yy,&y);
1108: }
1110: idx = a->j;
1111: v = a->a;
1112: if (usecprow){
1113: mbs = a->compressedrow.nrows;
1114: ii = a->compressedrow.i;
1115: ridx = a->compressedrow.rindex;
1116: } else {
1117: mbs = a->mbs;
1118: ii = a->i;
1119: z = zarray;
1120: }
1122: if (!a->mult_work) {
1123: k = PetscMax(A->m,A->n);
1124: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1125: }
1126: work = a->mult_work;
1127: for (i=0; i<mbs; i++) {
1128: n = ii[1] - ii[0]; ii++;
1129: ncols = n*bs;
1130: workt = work;
1131: for (j=0; j<n; j++) {
1132: xb = x + bs*(*idx++);
1133: for (k=0; k<bs; k++) workt[k] = xb[k];
1134: workt += bs;
1135: }
1136: if (usecprow) z = zarray + bs*ridx[i];
1137: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1138: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1139: v += n*bs2;
1140: if (!usecprow){
1141: z += bs;
1142: }
1143: }
1144: VecRestoreArray(xx,&x);
1145: VecRestoreArray(zz,&zarray);
1146: PetscLogFlops(2*a->nz*bs2);
1147: return(0);
1148: }
1152: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1153: {
1154: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1155: PetscScalar zero = 0.0;
1159: VecSet(zz,zero);
1160: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1162: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1163: return(0);
1164: }
1168: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1170: {
1171: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1172: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1173: MatScalar *v;
1175: PetscInt mbs,i,*idx,*ii,rval,bs=A->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1176: Mat_CompressedRow cprow = a->compressedrow;
1177: PetscTruth usecprow=cprow.use;
1180: if (yy != zz) { VecCopy(yy,zz); }
1181: VecGetArray(xx,&x);
1182: VecGetArray(zz,&z);
1184: idx = a->j;
1185: v = a->a;
1186: if (usecprow){
1187: mbs = cprow.nrows;
1188: ii = cprow.i;
1189: ridx = cprow.rindex;
1190: } else {
1191: mbs=a->mbs;
1192: ii = a->i;
1193: xb = x;
1194: }
1196: switch (bs) {
1197: case 1:
1198: for (i=0; i<mbs; i++) {
1199: if (usecprow) xb = x + ridx[i];
1200: x1 = xb[0];
1201: ib = idx + ii[0];
1202: n = ii[1] - ii[0]; ii++;
1203: for (j=0; j<n; j++) {
1204: rval = ib[j];
1205: z[rval] += *v * x1;
1206: v++;
1207: }
1208: if (!usecprow) xb++;
1209: }
1210: break;
1211: case 2:
1212: for (i=0; i<mbs; i++) {
1213: if (usecprow) xb = x + 2*ridx[i];
1214: x1 = xb[0]; x2 = xb[1];
1215: ib = idx + ii[0];
1216: n = ii[1] - ii[0]; ii++;
1217: for (j=0; j<n; j++) {
1218: rval = ib[j]*2;
1219: z[rval++] += v[0]*x1 + v[1]*x2;
1220: z[rval++] += v[2]*x1 + v[3]*x2;
1221: v += 4;
1222: }
1223: if (!usecprow) xb += 2;
1224: }
1225: break;
1226: case 3:
1227: for (i=0; i<mbs; i++) {
1228: if (usecprow) xb = x + 3*ridx[i];
1229: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1230: ib = idx + ii[0];
1231: n = ii[1] - ii[0]; ii++;
1232: for (j=0; j<n; j++) {
1233: rval = ib[j]*3;
1234: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1235: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1236: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1237: v += 9;
1238: }
1239: if (!usecprow) xb += 3;
1240: }
1241: break;
1242: case 4:
1243: for (i=0; i<mbs; i++) {
1244: if (usecprow) xb = x + 4*ridx[i];
1245: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1246: ib = idx + ii[0];
1247: n = ii[1] - ii[0]; ii++;
1248: for (j=0; j<n; j++) {
1249: rval = ib[j]*4;
1250: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1251: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1252: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1253: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1254: v += 16;
1255: }
1256: if (!usecprow) xb += 4;
1257: }
1258: break;
1259: case 5:
1260: for (i=0; i<mbs; i++) {
1261: if (usecprow) xb = x + 5*ridx[i];
1262: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1263: x4 = xb[3]; x5 = xb[4];
1264: ib = idx + ii[0];
1265: n = ii[1] - ii[0]; ii++;
1266: for (j=0; j<n; j++) {
1267: rval = ib[j]*5;
1268: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1269: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1270: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1271: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1272: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1273: v += 25;
1274: }
1275: if (!usecprow) xb += 5;
1276: }
1277: break;
1278: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1279: PetscInt ncols,k;
1280: PetscScalar *work,*workt,*xtmp;
1282: if (!a->mult_work) {
1283: k = PetscMax(A->m,A->n);
1284: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1285: }
1286: work = a->mult_work;
1287: xtmp = x;
1288: for (i=0; i<mbs; i++) {
1289: n = ii[1] - ii[0]; ii++;
1290: ncols = n*bs;
1291: PetscMemzero(work,ncols*sizeof(PetscScalar));
1292: if (usecprow) {
1293: xtmp = x + bs*ridx[i];
1294: }
1295: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1296: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1297: v += n*bs2;
1298: if (!usecprow) xtmp += bs;
1299: workt = work;
1300: for (j=0; j<n; j++) {
1301: zb = z + bs*(*idx++);
1302: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1303: workt += bs;
1304: }
1305: }
1306: }
1307: }
1308: VecRestoreArray(xx,&x);
1309: VecRestoreArray(zz,&z);
1310: PetscLogFlops(2*a->nz*a->bs2);
1311: return(0);
1312: }
1316: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1317: {
1318: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1319: PetscInt totalnz = a->bs2*a->nz;
1320: PetscScalar oalpha = alpha;
1322: #if defined(PETSC_USE_MAT_SINGLE)
1323: PetscInt i;
1324: #else
1325: PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1326: #endif
1329: #if defined(PETSC_USE_MAT_SINGLE)
1330: for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1331: #else
1332: BLASscal_(&tnz,&oalpha,a->a,&one);
1333: #endif
1334: PetscLogFlops(totalnz);
1335: return(0);
1336: }
1340: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1341: {
1343: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1344: MatScalar *v = a->a;
1345: PetscReal sum = 0.0;
1346: PetscInt i,j,k,bs=A->bs,nz=a->nz,bs2=a->bs2,k1;
1349: if (type == NORM_FROBENIUS) {
1350: for (i=0; i< bs2*nz; i++) {
1351: #if defined(PETSC_USE_COMPLEX)
1352: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1353: #else
1354: sum += (*v)*(*v); v++;
1355: #endif
1356: }
1357: *norm = sqrt(sum);
1358: } else if (type == NORM_1) { /* maximum column sum */
1359: PetscReal *tmp;
1360: PetscInt *bcol = a->j;
1361: PetscMalloc((A->n+1)*sizeof(PetscReal),&tmp);
1362: PetscMemzero(tmp,A->n*sizeof(PetscReal));
1363: for (i=0; i<nz; i++){
1364: for (j=0; j<bs; j++){
1365: k1 = bs*(*bcol) + j; /* column index */
1366: for (k=0; k<bs; k++){
1367: tmp[k1] += PetscAbsScalar(*v); v++;
1368: }
1369: }
1370: bcol++;
1371: }
1372: *norm = 0.0;
1373: for (j=0; j<A->n; j++) {
1374: if (tmp[j] > *norm) *norm = tmp[j];
1375: }
1376: PetscFree(tmp);
1377: } else if (type == NORM_INFINITY) { /* maximum row sum */
1378: *norm = 0.0;
1379: for (k=0; k<bs; k++) {
1380: for (j=0; j<a->mbs; j++) {
1381: v = a->a + bs2*a->i[j] + k;
1382: sum = 0.0;
1383: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1384: for (k1=0; k1<bs; k1++){
1385: sum += PetscAbsScalar(*v);
1386: v += bs;
1387: }
1388: }
1389: if (sum > *norm) *norm = sum;
1390: }
1391: }
1392: } else {
1393: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1394: }
1395: return(0);
1396: }
1401: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1402: {
1403: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1407: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1408: if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1409: *flg = PETSC_FALSE;
1410: return(0);
1411: }
1412:
1413: /* if the a->i are the same */
1414: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1415: if (!*flg) {
1416: return(0);
1417: }
1418:
1419: /* if a->j are the same */
1420: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1421: if (!*flg) {
1422: return(0);
1423: }
1424: /* if a->a are the same */
1425: PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(B->bs)*sizeof(PetscScalar),flg);
1426: return(0);
1427:
1428: }
1432: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1433: {
1434: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1436: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1437: PetscScalar *x,zero = 0.0;
1438: MatScalar *aa,*aa_j;
1441: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1442: bs = A->bs;
1443: aa = a->a;
1444: ai = a->i;
1445: aj = a->j;
1446: ambs = a->mbs;
1447: bs2 = a->bs2;
1449: VecSet(v,zero);
1450: VecGetArray(v,&x);
1451: VecGetLocalSize(v,&n);
1452: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1453: for (i=0; i<ambs; i++) {
1454: for (j=ai[i]; j<ai[i+1]; j++) {
1455: if (aj[j] == i) {
1456: row = i*bs;
1457: aa_j = aa+j*bs2;
1458: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1459: break;
1460: }
1461: }
1462: }
1463: VecRestoreArray(v,&x);
1464: return(0);
1465: }
1469: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1470: {
1471: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1472: PetscScalar *l,*r,x,*li,*ri;
1473: MatScalar *aa,*v;
1475: PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1478: ai = a->i;
1479: aj = a->j;
1480: aa = a->a;
1481: m = A->m;
1482: n = A->n;
1483: bs = A->bs;
1484: mbs = a->mbs;
1485: bs2 = a->bs2;
1486: if (ll) {
1487: VecGetArray(ll,&l);
1488: VecGetLocalSize(ll,&lm);
1489: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1490: for (i=0; i<mbs; i++) { /* for each block row */
1491: M = ai[i+1] - ai[i];
1492: li = l + i*bs;
1493: v = aa + bs2*ai[i];
1494: for (j=0; j<M; j++) { /* for each block */
1495: for (k=0; k<bs2; k++) {
1496: (*v++) *= li[k%bs];
1497: }
1498: }
1499: }
1500: VecRestoreArray(ll,&l);
1501: PetscLogFlops(a->nz);
1502: }
1503:
1504: if (rr) {
1505: VecGetArray(rr,&r);
1506: VecGetLocalSize(rr,&rn);
1507: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1508: for (i=0; i<mbs; i++) { /* for each block row */
1509: M = ai[i+1] - ai[i];
1510: v = aa + bs2*ai[i];
1511: for (j=0; j<M; j++) { /* for each block */
1512: ri = r + bs*aj[ai[i]+j];
1513: for (k=0; k<bs; k++) {
1514: x = ri[k];
1515: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1516: }
1517: }
1518: }
1519: VecRestoreArray(rr,&r);
1520: PetscLogFlops(a->nz);
1521: }
1522: return(0);
1523: }
1528: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1529: {
1530: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1533: info->rows_global = (double)A->m;
1534: info->columns_global = (double)A->n;
1535: info->rows_local = (double)A->m;
1536: info->columns_local = (double)A->n;
1537: info->block_size = a->bs2;
1538: info->nz_allocated = a->maxnz;
1539: info->nz_used = a->bs2*a->nz;
1540: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1541: info->assemblies = A->num_ass;
1542: info->mallocs = a->reallocs;
1543: info->memory = A->mem;
1544: if (A->factor) {
1545: info->fill_ratio_given = A->info.fill_ratio_given;
1546: info->fill_ratio_needed = A->info.fill_ratio_needed;
1547: info->factor_mallocs = A->info.factor_mallocs;
1548: } else {
1549: info->fill_ratio_given = 0;
1550: info->fill_ratio_needed = 0;
1551: info->factor_mallocs = 0;
1552: }
1553: return(0);
1554: }
1559: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1560: {
1561: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1565: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1566: return(0);
1567: }