Actual source code: baij2.c
1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/vec/vecimpl.h
5: #include src/inline/spops.h
6: #include src/inline/ilu.h
7: #include petscbt.h
9: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
12: int row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
13: int start,end,*ai,*aj,bs,*nidx2;
14: PetscBT table;
17: m = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = a->bs;
22: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24: PetscBTCreate(m,table);
25: PetscMalloc((m+1)*sizeof(int),&nidx);
26: PetscMalloc((A->m+1)*sizeof(int),&nidx2);
28: for (i=0; i<is_max; i++) {
29: /* Initialise the two local arrays */
30: isz = 0;
31: PetscBTMemzero(m,table);
32:
33: /* Extract the indices, assume there can be duplicate entries */
34: ISGetIndices(is[i],&idx);
35: ISGetLocalSize(is[i],&n);
37: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38: for (j=0; j<n ; ++j){
39: ival = idx[j]/bs; /* convert the indices into block indices */
40: if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
42: }
43: ISRestoreIndices(is[i],&idx);
44: ISDestroy(is[i]);
45:
46: k = 0;
47: for (j=0; j<ov; j++){ /* for each overlap*/
48: n = isz;
49: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
50: row = nidx[k];
51: start = ai[row];
52: end = ai[row+1];
53: for (l = start; l<end ; l++){
54: val = aj[l];
55: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
56: }
57: }
58: }
59: /* expand the Index Set */
60: for (j=0; j<isz; j++) {
61: for (k=0; k<bs; k++)
62: nidx2[j*bs+k] = nidx[j]*bs+k;
63: }
64: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
65: }
66: PetscBTDestroy(table);
67: PetscFree(nidx);
68: PetscFree(nidx2);
69: return(0);
70: }
72: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
73: {
74: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
75: int *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
76: int row,mat_i,*mat_j,tcol,*mat_ilen;
77: int *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
78: int *aj = a->j,*ai = a->i;
79: MatScalar *mat_a;
80: Mat C;
81: PetscTruth flag;
84: ISSorted(iscol,(PetscTruth*)&i);
85: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
87: ISGetIndices(isrow,&irow);
88: ISGetIndices(iscol,&icol);
89: ISGetLocalSize(isrow,&nrows);
90: ISGetLocalSize(iscol,&ncols);
92: PetscMalloc((1+oldcols)*sizeof(int),&smap);
93: ssmap = smap;
94: PetscMalloc((1+nrows)*sizeof(int),&lens);
95: ierr = PetscMemzero(smap,oldcols*sizeof(int));
96: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
97: /* determine lens of each row */
98: for (i=0; i<nrows; i++) {
99: kstart = ai[irow[i]];
100: kend = kstart + a->ilen[irow[i]];
101: lens[i] = 0;
102: for (k=kstart; k<kend; k++) {
103: if (ssmap[aj[k]]) {
104: lens[i]++;
105: }
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ *)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
114: if (flag == PETSC_FALSE) {
115: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
116: }
117: PetscMemzero(c->ilen,c->mbs*sizeof(int));
118: C = *B;
119: } else {
120: MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
121: }
122: c = (Mat_SeqBAIJ *)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140:
141: /* Free work space */
142: ISRestoreIndices(iscol,&icol);
143: PetscFree(smap);
144: PetscFree(lens);
145: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
146: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
147:
148: ISRestoreIndices(isrow,&irow);
149: *B = C;
150: return(0);
151: }
153: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
154: {
155: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
156: IS is1,is2;
157: int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
160: ISGetIndices(isrow,&irow);
161: ISGetIndices(iscol,&icol);
162: ISGetLocalSize(isrow,&nrows);
163: ISGetLocalSize(iscol,&ncols);
164:
165: /* Verify if the indices corespond to each element in a block
166: and form the IS with compressed IS */
167: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
168: iary = vary + a->mbs;
169: PetscMemzero(vary,(a->mbs)*sizeof(int));
170: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
171: count = 0;
172: for (i=0; i<a->mbs; i++) {
173: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
174: if (vary[i]==bs) iary[count++] = i;
175: }
176: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
177:
178: PetscMemzero(vary,(a->mbs)*sizeof(int));
179: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
180: count = 0;
181: for (i=0; i<a->mbs; i++) {
182: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
183: if (vary[i]==bs) iary[count++] = i;
184: }
185: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
186: ISRestoreIndices(isrow,&irow);
187: ISRestoreIndices(iscol,&icol);
188: PetscFree(vary);
190: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
191: ISDestroy(is1);
192: ISDestroy(is2);
193: return(0);
194: }
196: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
197: {
198: int ierr,i;
201: if (scall == MAT_INITIAL_MATRIX) {
202: PetscMalloc((n+1)*sizeof(Mat),B);
203: }
205: for (i=0; i<n; i++) {
206: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
207: }
208: return(0);
209: }
212: /* -------------------------------------------------------*/
213: /* Should check that shapes of vectors and matrices match */
214: /* -------------------------------------------------------*/
215: #include petscblaslapack.h
217: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
218: {
219: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
220: PetscScalar *x,*z,sum;
221: MatScalar *v;
222: int mbs=a->mbs,i,*idx,*ii,n,ierr;
225: VecGetArray(xx,&x);
226: VecGetArray(zz,&z);
228: idx = a->j;
229: v = a->a;
230: ii = a->i;
232: for (i=0; i<mbs; i++) {
233: n = ii[1] - ii[0]; ii++;
234: sum = 0.0;
235: while (n--) sum += *v++ * x[*idx++];
236: z[i] = sum;
237: }
238: VecRestoreArray(xx,&x);
239: VecRestoreArray(zz,&z);
240: PetscLogFlops(2*a->nz - A->m);
241: return(0);
242: }
244: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
245: {
246: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
247: PetscScalar *x,*z,*xb,sum1,sum2;
248: PetscScalar x1,x2;
249: MatScalar *v;
250: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
253: VecGetArray(xx,&x);
254: VecGetArray(zz,&z);
256: idx = a->j;
257: v = a->a;
258: ii = a->i;
260: for (i=0; i<mbs; i++) {
261: n = ii[1] - ii[0]; ii++;
262: sum1 = 0.0; sum2 = 0.0;
263: for (j=0; j<n; j++) {
264: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
265: sum1 += v[0]*x1 + v[2]*x2;
266: sum2 += v[1]*x1 + v[3]*x2;
267: v += 4;
268: }
269: z[0] = sum1; z[1] = sum2;
270: z += 2;
271: }
272: VecRestoreArray(xx,&x);
273: VecRestoreArray(zz,&z);
274: PetscLogFlops(8*a->nz - A->m);
275: return(0);
276: }
278: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
279: {
280: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
281: PetscScalar *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
282: MatScalar *v;
283: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
285: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
286: #pragma disjoint(*v,*z,*xb)
287: #endif
290: VecGetArray(xx,&x);
291: VecGetArray(zz,&z);
293: idx = a->j;
294: v = a->a;
295: ii = a->i;
297: for (i=0; i<mbs; i++) {
298: n = ii[1] - ii[0]; ii++;
299: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
300: for (j=0; j<n; j++) {
301: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
302: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
303: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
304: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
305: v += 9;
306: }
307: z[0] = sum1; z[1] = sum2; z[2] = sum3;
308: z += 3;
309: }
310: VecRestoreArray(xx,&x);
311: VecRestoreArray(zz,&z);
312: PetscLogFlops(18*a->nz - A->m);
313: return(0);
314: }
316: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
317: {
318: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
319: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
320: MatScalar *v;
321: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
324: VecGetArray(xx,&x);
325: VecGetArray(zz,&z);
327: idx = a->j;
328: v = a->a;
329: ii = a->i;
331: for (i=0; i<mbs; i++) {
332: n = ii[1] - ii[0]; ii++;
333: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
334: for (j=0; j<n; j++) {
335: xb = x + 4*(*idx++);
336: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
337: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
338: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
339: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
340: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
341: v += 16;
342: }
343: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
344: z += 4;
345: }
346: VecRestoreArray(xx,&x);
347: VecRestoreArray(zz,&z);
348: PetscLogFlops(32*a->nz - A->m);
349: return(0);
350: }
352: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
353: {
354: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
355: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
356: MatScalar *v;
357: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
360: VecGetArray(xx,&x);
361: VecGetArray(zz,&z);
363: idx = a->j;
364: v = a->a;
365: ii = a->i;
367: for (i=0; i<mbs; i++) {
368: n = ii[1] - ii[0]; ii++;
369: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
370: for (j=0; j<n; j++) {
371: xb = x + 5*(*idx++);
372: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
373: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
374: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
375: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
376: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
377: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
378: v += 25;
379: }
380: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
381: z += 5;
382: }
383: VecRestoreArray(xx,&x);
384: VecRestoreArray(zz,&z);
385: PetscLogFlops(50*a->nz - A->m);
386: return(0);
387: }
390: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
391: {
392: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
393: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
394: PetscScalar x1,x2,x3,x4,x5,x6;
395: MatScalar *v;
396: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
399: VecGetArray(xx,&x);
400: VecGetArray(zz,&z);
402: idx = a->j;
403: v = a->a;
404: ii = a->i;
406: for (i=0; i<mbs; i++) {
407: n = ii[1] - ii[0]; ii++;
408: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
409: for (j=0; j<n; j++) {
410: xb = x + 6*(*idx++);
411: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
412: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
413: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
414: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
415: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
416: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
417: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
418: v += 36;
419: }
420: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
421: z += 6;
422: }
424: VecRestoreArray(xx,&x);
425: VecRestoreArray(zz,&z);
426: PetscLogFlops(72*a->nz - A->m);
427: return(0);
428: }
429: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
430: {
431: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
432: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
433: PetscScalar x1,x2,x3,x4,x5,x6,x7;
434: MatScalar *v;
435: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
438: VecGetArray(xx,&x);
439: VecGetArray(zz,&z);
441: idx = a->j;
442: v = a->a;
443: ii = a->i;
445: for (i=0; i<mbs; i++) {
446: n = ii[1] - ii[0]; ii++;
447: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
448: for (j=0; j<n; j++) {
449: xb = x + 7*(*idx++);
450: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
451: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
452: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
453: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
454: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
455: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
456: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
457: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
458: v += 49;
459: }
460: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
461: z += 7;
462: }
464: VecRestoreArray(xx,&x);
465: VecRestoreArray(zz,&z);
466: PetscLogFlops(98*a->nz - A->m);
467: return(0);
468: }
470: /*
471: This will not work with MatScalar == float because it calls the BLAS
472: */
473: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
474: {
475: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
476: PetscScalar *x,*z,*xb,*work,*workt;
477: MatScalar *v;
478: int ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
479: int ncols,k;
482: VecGetArray(xx,&x);
483: VecGetArray(zz,&z);
485: idx = a->j;
486: v = a->a;
487: ii = a->i;
490: if (!a->mult_work) {
491: k = PetscMax(A->m,A->n);
492: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
493: }
494: work = a->mult_work;
495: for (i=0; i<mbs; i++) {
496: n = ii[1] - ii[0]; ii++;
497: ncols = n*bs;
498: workt = work;
499: for (j=0; j<n; j++) {
500: xb = x + bs*(*idx++);
501: for (k=0; k<bs; k++) workt[k] = xb[k];
502: workt += bs;
503: }
504: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
505: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
506: v += n*bs2;
507: z += bs;
508: }
509: VecRestoreArray(xx,&x);
510: VecRestoreArray(zz,&z);
511: PetscLogFlops(2*a->nz*bs2 - A->m);
512: return(0);
513: }
515: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
516: {
517: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
518: PetscScalar *x,*y,*z,sum;
519: MatScalar *v;
520: int ierr,mbs=a->mbs,i,*idx,*ii,n;
523: VecGetArray(xx,&x);
524: VecGetArray(yy,&y);
525: if (zz != yy) {
526: VecGetArray(zz,&z);
527: } else {
528: z = y;
529: }
531: idx = a->j;
532: v = a->a;
533: ii = a->i;
535: for (i=0; i<mbs; i++) {
536: n = ii[1] - ii[0]; ii++;
537: sum = y[i];
538: while (n--) sum += *v++ * x[*idx++];
539: z[i] = sum;
540: }
541: VecRestoreArray(xx,&x);
542: VecRestoreArray(yy,&y);
543: if (zz != yy) {
544: VecRestoreArray(zz,&z);
545: }
546: PetscLogFlops(2*a->nz);
547: return(0);
548: }
550: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
551: {
552: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
553: PetscScalar *x,*y,*z,*xb,sum1,sum2;
554: PetscScalar x1,x2;
555: MatScalar *v;
556: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
559: VecGetArray(xx,&x);
560: VecGetArray(yy,&y);
561: if (zz != yy) {
562: VecGetArray(zz,&z);
563: } else {
564: z = y;
565: }
567: idx = a->j;
568: v = a->a;
569: ii = a->i;
571: for (i=0; i<mbs; i++) {
572: n = ii[1] - ii[0]; ii++;
573: sum1 = y[0]; sum2 = y[1];
574: for (j=0; j<n; j++) {
575: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
576: sum1 += v[0]*x1 + v[2]*x2;
577: sum2 += v[1]*x1 + v[3]*x2;
578: v += 4;
579: }
580: z[0] = sum1; z[1] = sum2;
581: z += 2; y += 2;
582: }
583: VecRestoreArray(xx,&x);
584: VecRestoreArray(yy,&y);
585: if (zz != yy) {
586: VecRestoreArray(zz,&z);
587: }
588: PetscLogFlops(4*a->nz);
589: return(0);
590: }
592: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
593: {
594: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
595: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
596: MatScalar *v;
597: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
600: VecGetArray(xx,&x);
601: VecGetArray(yy,&y);
602: if (zz != yy) {
603: VecGetArray(zz,&z);
604: } else {
605: z = y;
606: }
608: idx = a->j;
609: v = a->a;
610: ii = a->i;
612: for (i=0; i<mbs; i++) {
613: n = ii[1] - ii[0]; ii++;
614: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
615: for (j=0; j<n; j++) {
616: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
617: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
618: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
619: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
620: v += 9;
621: }
622: z[0] = sum1; z[1] = sum2; z[2] = sum3;
623: z += 3; y += 3;
624: }
625: VecRestoreArray(xx,&x);
626: VecRestoreArray(yy,&y);
627: if (zz != yy) {
628: VecRestoreArray(zz,&z);
629: }
630: PetscLogFlops(18*a->nz);
631: return(0);
632: }
634: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
635: {
636: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
637: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
638: MatScalar *v;
639: int ierr,mbs=a->mbs,i,*idx,*ii;
640: int j,n;
643: VecGetArray(xx,&x);
644: VecGetArray(yy,&y);
645: if (zz != yy) {
646: VecGetArray(zz,&z);
647: } else {
648: z = y;
649: }
651: idx = a->j;
652: v = a->a;
653: ii = a->i;
655: for (i=0; i<mbs; i++) {
656: n = ii[1] - ii[0]; ii++;
657: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
658: for (j=0; j<n; j++) {
659: xb = x + 4*(*idx++);
660: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
661: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
662: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
663: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
664: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
665: v += 16;
666: }
667: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
668: z += 4; y += 4;
669: }
670: VecRestoreArray(xx,&x);
671: VecRestoreArray(yy,&y);
672: if (zz != yy) {
673: VecRestoreArray(zz,&z);
674: }
675: PetscLogFlops(32*a->nz);
676: return(0);
677: }
679: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
680: {
681: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
682: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
683: MatScalar *v;
684: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
687: VecGetArray(xx,&x);
688: VecGetArray(yy,&y);
689: if (zz != yy) {
690: VecGetArray(zz,&z);
691: } else {
692: z = y;
693: }
695: idx = a->j;
696: v = a->a;
697: ii = a->i;
699: for (i=0; i<mbs; i++) {
700: n = ii[1] - ii[0]; ii++;
701: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
702: for (j=0; j<n; j++) {
703: xb = x + 5*(*idx++);
704: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
705: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
706: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
707: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
708: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
709: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
710: v += 25;
711: }
712: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
713: z += 5; y += 5;
714: }
715: VecRestoreArray(xx,&x);
716: VecRestoreArray(yy,&y);
717: if (zz != yy) {
718: VecRestoreArray(zz,&z);
719: }
720: PetscLogFlops(50*a->nz);
721: return(0);
722: }
723: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
724: {
725: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
726: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
727: PetscScalar x1,x2,x3,x4,x5,x6;
728: MatScalar *v;
729: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
732: VecGetArray(xx,&x);
733: VecGetArray(yy,&y);
734: if (zz != yy) {
735: VecGetArray(zz,&z);
736: } else {
737: z = y;
738: }
740: idx = a->j;
741: v = a->a;
742: ii = a->i;
744: for (i=0; i<mbs; i++) {
745: n = ii[1] - ii[0]; ii++;
746: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
747: for (j=0; j<n; j++) {
748: xb = x + 6*(*idx++);
749: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
750: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
751: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
752: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
753: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
754: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
755: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
756: v += 36;
757: }
758: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
759: z += 6; y += 6;
760: }
761: VecRestoreArray(xx,&x);
762: VecRestoreArray(yy,&y);
763: if (zz != yy) {
764: VecRestoreArray(zz,&z);
765: }
766: PetscLogFlops(72*a->nz);
767: return(0);
768: }
770: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
771: {
772: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
773: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
774: PetscScalar x1,x2,x3,x4,x5,x6,x7;
775: MatScalar *v;
776: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
779: VecGetArray(xx,&x);
780: VecGetArray(yy,&y);
781: if (zz != yy) {
782: VecGetArray(zz,&z);
783: } else {
784: z = y;
785: }
787: idx = a->j;
788: v = a->a;
789: ii = a->i;
791: for (i=0; i<mbs; i++) {
792: n = ii[1] - ii[0]; ii++;
793: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
794: for (j=0; j<n; j++) {
795: xb = x + 7*(*idx++);
796: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
797: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
798: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
799: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
800: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
801: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
802: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
803: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
804: v += 49;
805: }
806: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
807: z += 7; y += 7;
808: }
809: VecRestoreArray(xx,&x);
810: VecRestoreArray(yy,&y);
811: if (zz != yy) {
812: VecRestoreArray(zz,&z);
813: }
814: PetscLogFlops(98*a->nz);
815: return(0);
816: }
818: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
819: {
820: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
821: PetscScalar *x,*z,*xb,*work,*workt,*y;
822: MatScalar *v;
823: int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
824: int ncols,k;
827: VecGetArray(xx,&x);
828: VecGetArray(zz,&z);
829: if (zz != yy) {
830: VecGetArrayFast(yy,&y);
831: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
832: VecRestoreArrayFast(yy,&y);
833: }
835: idx = a->j;
836: v = a->a;
837: ii = a->i;
840: if (!a->mult_work) {
841: k = PetscMax(A->m,A->n);
842: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
843: }
844: work = a->mult_work;
845: for (i=0; i<mbs; i++) {
846: n = ii[1] - ii[0]; ii++;
847: ncols = n*bs;
848: workt = work;
849: for (j=0; j<n; j++) {
850: xb = x + bs*(*idx++);
851: for (k=0; k<bs; k++) workt[k] = xb[k];
852: workt += bs;
853: }
854: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
855: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
856: v += n*bs2;
857: z += bs;
858: }
859: VecRestoreArray(xx,&x);
860: VecRestoreArray(zz,&z);
861: PetscLogFlops(2*a->nz*bs2);
862: return(0);
863: }
865: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
866: {
867: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
868: PetscScalar *xg,*zg,*zb,zero = 0.0;
869: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
870: MatScalar *v;
871: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
872: int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
875: VecSet(&zero,zz);
876: VecGetArray(xx,&xg); x = xg;
877: VecGetArray(zz,&zg); z = zg;
879: idx = a->j;
880: v = a->a;
881: ii = a->i;
882: xb = x;
883: switch (bs) {
884: case 1:
885: for (i=0; i<mbs; i++) {
886: n = ii[1] - ii[0]; ii++;
887: x1 = xb[0];
888: ib = idx + ai[i];
889: for (j=0; j<n; j++) {
890: rval = ib[j];
891: z[rval] += *v * x1;
892: v++;
893: }
894: xb++;
895: }
896: break;
897: case 2:
898: for (i=0; i<mbs; i++) {
899: n = ii[1] - ii[0]; ii++;
900: x1 = xb[0]; x2 = xb[1];
901: ib = idx + ai[i];
902: for (j=0; j<n; j++) {
903: rval = ib[j]*2;
904: z[rval++] += v[0]*x1 + v[1]*x2;
905: z[rval] += v[2]*x1 + v[3]*x2;
906: v += 4;
907: }
908: xb += 2;
909: }
910: break;
911: case 3:
912: for (i=0; i<mbs; i++) {
913: n = ii[1] - ii[0]; ii++;
914: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
915: ib = idx + ai[i];
916: for (j=0; j<n; j++) {
917: rval = ib[j]*3;
918: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
919: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
920: z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3;
921: v += 9;
922: }
923: xb += 3;
924: }
925: break;
926: case 4:
927: for (i=0; i<mbs; i++) {
928: n = ii[1] - ii[0]; ii++;
929: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
930: ib = idx + ai[i];
931: for (j=0; j<n; j++) {
932: rval = ib[j]*4;
933: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
934: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
935: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
936: z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
937: v += 16;
938: }
939: xb += 4;
940: }
941: break;
942: case 5:
943: for (i=0; i<mbs; i++) {
944: n = ii[1] - ii[0]; ii++;
945: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
946: x4 = xb[3]; x5 = xb[4];
947: ib = idx + ai[i];
948: for (j=0; j<n; j++) {
949: rval = ib[j]*5;
950: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
951: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
952: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
953: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
954: z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
955: v += 25;
956: }
957: xb += 5;
958: }
959: break;
960: case 6:
961: for (i=0; i<mbs; i++) {
962: n = ii[1] - ii[0]; ii++;
963: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
964: x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
965: ib = idx + ai[i];
966: for (j=0; j<n; j++) {
967: rval = ib[j]*6;
968: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
969: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
970: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
971: z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
972: z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
973: z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
974: v += 36;
975: }
976: xb += 6;
977: }
978: break;
979: case 7:
980: for (i=0; i<mbs; i++) {
981: n = ii[1] - ii[0]; ii++;
982: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
983: x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
984: ib = idx + ai[i];
985: for (j=0; j<n; j++) {
986: rval = ib[j]*7;
987: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
988: z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
989: z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
990: z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
991: z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
992: z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
993: z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
994: v += 49;
995: }
996: xb += 7;
997: }
998: break;
999: default: { /* block sizes larger then 7 by 7 are handled by BLAS */
1000: int ncols,k;
1001: PetscScalar *work,*workt;
1003: if (!a->mult_work) {
1004: k = PetscMax(A->m,A->n);
1005: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1006: }
1007: work = a->mult_work;
1008: for (i=0; i<mbs; i++) {
1009: n = ii[1] - ii[0]; ii++;
1010: ncols = n*bs;
1011: ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));
1012: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1013: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1014: v += n*bs2;
1015: x += bs;
1016: workt = work;
1017: for (j=0; j<n; j++) {
1018: zb = z + bs*(*idx++);
1019: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1020: workt += bs;
1021: }
1022: }
1023: }
1024: }
1025: VecRestoreArray(xx,&xg);
1026: VecRestoreArray(zz,&zg);
1027: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1028: return(0);
1029: }
1031: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1033: {
1034: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1035: PetscScalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1036: MatScalar *v;
1037: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1040: if (yy != zz) { VecCopy(yy,zz); }
1041: VecGetArray(xx,&xg); x = xg;
1042: VecGetArray(zz,&zg); z = zg;
1045: idx = a->j;
1046: v = a->a;
1047: ii = a->i;
1048: xb = x;
1050: switch (bs) {
1051: case 1:
1052: for (i=0; i<mbs; i++) {
1053: n = ii[1] - ii[0]; ii++;
1054: x1 = xb[0];
1055: ib = idx + ai[i];
1056: for (j=0; j<n; j++) {
1057: rval = ib[j];
1058: z[rval] += *v * x1;
1059: v++;
1060: }
1061: xb++;
1062: }
1063: break;
1064: case 2:
1065: for (i=0; i<mbs; i++) {
1066: n = ii[1] - ii[0]; ii++;
1067: x1 = xb[0]; x2 = xb[1];
1068: ib = idx + ai[i];
1069: for (j=0; j<n; j++) {
1070: rval = ib[j]*2;
1071: z[rval++] += v[0]*x1 + v[1]*x2;
1072: z[rval++] += v[2]*x1 + v[3]*x2;
1073: v += 4;
1074: }
1075: xb += 2;
1076: }
1077: break;
1078: case 3:
1079: for (i=0; i<mbs; i++) {
1080: n = ii[1] - ii[0]; ii++;
1081: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1082: ib = idx + ai[i];
1083: for (j=0; j<n; j++) {
1084: rval = ib[j]*3;
1085: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1086: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1087: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1088: v += 9;
1089: }
1090: xb += 3;
1091: }
1092: break;
1093: case 4:
1094: for (i=0; i<mbs; i++) {
1095: n = ii[1] - ii[0]; ii++;
1096: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1097: ib = idx + ai[i];
1098: for (j=0; j<n; j++) {
1099: rval = ib[j]*4;
1100: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1101: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1102: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1103: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1104: v += 16;
1105: }
1106: xb += 4;
1107: }
1108: break;
1109: case 5:
1110: for (i=0; i<mbs; i++) {
1111: n = ii[1] - ii[0]; ii++;
1112: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1113: x4 = xb[3]; x5 = xb[4];
1114: ib = idx + ai[i];
1115: for (j=0; j<n; j++) {
1116: rval = ib[j]*5;
1117: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1118: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1119: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1120: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1121: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1122: v += 25;
1123: }
1124: xb += 5;
1125: }
1126: break;
1127: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1128: int ncols,k;
1129: PetscScalar *work,*workt;
1131: if (!a->mult_work) {
1132: k = PetscMax(A->m,A->n);
1133: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1134: }
1135: work = a->mult_work;
1136: for (i=0; i<mbs; i++) {
1137: n = ii[1] - ii[0]; ii++;
1138: ncols = n*bs;
1139: ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));
1140: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1141: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1142: v += n*bs2;
1143: x += bs;
1144: workt = work;
1145: for (j=0; j<n; j++) {
1146: zb = z + bs*(*idx++);
1147: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1148: workt += bs;
1149: }
1150: }
1151: }
1152: }
1153: VecRestoreArray(xx,&xg);
1154: VecRestoreArray(zz,&zg);
1155: PetscLogFlops(2*a->nz*a->bs2);
1156: return(0);
1157: }
1159: int MatScale_SeqBAIJ(PetscScalar *alpha,Mat inA)
1160: {
1161: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1162: int totalnz = a->bs2*a->nz;
1163: #if defined(PETSC_USE_MAT_SINGLE)
1164: int i;
1165: #else
1166: int one = 1;
1167: #endif
1170: #if defined(PETSC_USE_MAT_SINGLE)
1171: for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1172: #else
1173: BLscal_(&totalnz,alpha,a->a,&one);
1174: #endif
1175: PetscLogFlops(totalnz);
1176: return(0);
1177: }
1179: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1180: {
1181: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1182: MatScalar *v = a->a;
1183: PetscReal sum = 0.0;
1184: int i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1187: if (type == NORM_FROBENIUS) {
1188: for (i=0; i< bs2*nz; i++) {
1189: #if defined(PETSC_USE_COMPLEX)
1190: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1191: #else
1192: sum += (*v)*(*v); v++;
1193: #endif
1194: }
1195: *norm = sqrt(sum);
1196: } else if (type == NORM_INFINITY) { /* maximum row sum */
1197: *norm = 0.0;
1198: for (k=0; k<bs; k++) {
1199: for (j=0; j<a->mbs; j++) {
1200: v = a->a + bs2*a->i[j] + k;
1201: sum = 0.0;
1202: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1203: for (k1=0; k1<bs; k1++){
1204: sum += PetscAbsScalar(*v);
1205: v += bs;
1206: }
1207: }
1208: if (sum > *norm) *norm = sum;
1209: }
1210: }
1211: } else {
1212: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1213: }
1214: return(0);
1215: }
1218: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1219: {
1220: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1221: int ierr;
1222: PetscTruth flag;
1225: PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1226: if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1228: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1229: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1230: *flg = PETSC_FALSE;
1231: return(0);
1232: }
1233:
1234: /* if the a->i are the same */
1235: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1236: if (*flg == PETSC_FALSE) {
1237: return(0);
1238: }
1239:
1240: /* if a->j are the same */
1241: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1242: if (*flg == PETSC_FALSE) {
1243: return(0);
1244: }
1245: /* if a->a are the same */
1246: PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1247: return(0);
1248:
1249: }
1251: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1252: {
1253: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1254: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1255: PetscScalar *x,zero = 0.0;
1256: MatScalar *aa,*aa_j;
1259: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1260: bs = a->bs;
1261: aa = a->a;
1262: ai = a->i;
1263: aj = a->j;
1264: ambs = a->mbs;
1265: bs2 = a->bs2;
1267: VecSet(&zero,v);
1268: VecGetArray(v,&x);
1269: VecGetLocalSize(v,&n);
1270: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1271: for (i=0; i<ambs; i++) {
1272: for (j=ai[i]; j<ai[i+1]; j++) {
1273: if (aj[j] == i) {
1274: row = i*bs;
1275: aa_j = aa+j*bs2;
1276: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1277: break;
1278: }
1279: }
1280: }
1281: VecRestoreArray(v,&x);
1282: return(0);
1283: }
1285: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1286: {
1287: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1288: PetscScalar *l,*r,x,*li,*ri;
1289: MatScalar *aa,*v;
1290: int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1293: ai = a->i;
1294: aj = a->j;
1295: aa = a->a;
1296: m = A->m;
1297: n = A->n;
1298: bs = a->bs;
1299: mbs = a->mbs;
1300: bs2 = a->bs2;
1301: if (ll) {
1302: VecGetArray(ll,&l);
1303: VecGetLocalSize(ll,&lm);
1304: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1305: for (i=0; i<mbs; i++) { /* for each block row */
1306: M = ai[i+1] - ai[i];
1307: li = l + i*bs;
1308: v = aa + bs2*ai[i];
1309: for (j=0; j<M; j++) { /* for each block */
1310: for (k=0; k<bs2; k++) {
1311: (*v++) *= li[k%bs];
1312: }
1313: }
1314: }
1315: VecRestoreArray(ll,&l);
1316: PetscLogFlops(a->nz);
1317: }
1318:
1319: if (rr) {
1320: VecGetArray(rr,&r);
1321: VecGetLocalSize(rr,&rn);
1322: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1323: for (i=0; i<mbs; i++) { /* for each block row */
1324: M = ai[i+1] - ai[i];
1325: v = aa + bs2*ai[i];
1326: for (j=0; j<M; j++) { /* for each block */
1327: ri = r + bs*aj[ai[i]+j];
1328: for (k=0; k<bs; k++) {
1329: x = ri[k];
1330: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1331: }
1332: }
1333: }
1334: VecRestoreArray(rr,&r);
1335: PetscLogFlops(a->nz);
1336: }
1337: return(0);
1338: }
1341: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1342: {
1343: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1346: info->rows_global = (double)A->m;
1347: info->columns_global = (double)A->n;
1348: info->rows_local = (double)A->m;
1349: info->columns_local = (double)A->n;
1350: info->block_size = a->bs2;
1351: info->nz_allocated = a->maxnz;
1352: info->nz_used = a->bs2*a->nz;
1353: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1354: info->assemblies = A->num_ass;
1355: info->mallocs = a->reallocs;
1356: info->memory = A->mem;
1357: if (A->factor) {
1358: info->fill_ratio_given = A->info.fill_ratio_given;
1359: info->fill_ratio_needed = A->info.fill_ratio_needed;
1360: info->factor_mallocs = A->info.factor_mallocs;
1361: } else {
1362: info->fill_ratio_given = 0;
1363: info->fill_ratio_needed = 0;
1364: info->factor_mallocs = 0;
1365: }
1366: return(0);
1367: }
1370: int MatZeroEntries_SeqBAIJ(Mat A)
1371: {
1372: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1373: int ierr;
1376: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1377: return(0);
1378: }