Actual source code: sbaij2.c
1: #define PETSCMAT_DLL
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/inline/spops.h
5: #include src/inline/ilu.h
6: #include petscbt.h
7: #include src/mat/impls/sbaij/seq/sbaij.h
11: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12: {
13: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
15: PetscInt brow,i,j,k,l,mbs,n,*idx,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table,table0;
19: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20: mbs = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap.bs;
24: PetscBTCreate(mbs,table);
25: PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);
26: PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);
27: PetscBTCreate(mbs,table0);
29: for (i=0; i<is_max; i++) { /* for each is */
30: isz = 0;
31: PetscBTMemzero(mbs,table);
32:
33: /* Extract the indices, assume there can be duplicate entries */
34: ISGetIndices(is[i],&idx);
35: ISGetLocalSize(is[i],&n);
37: /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
38: bcol_max = 0;
39: for (j=0; j<n ; ++j){
40: brow = idx[j]/bs; /* convert the indices into block indices */
41: if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42: if(!PetscBTLookupSet(table,brow)) {
43: nidx[isz++] = brow;
44: if (bcol_max < brow) bcol_max = brow;
45: }
46: }
47: ISRestoreIndices(is[i],&idx);
48: ISDestroy(is[i]);
49:
50: k = 0;
51: for (j=0; j<ov; j++){ /* for each overlap */
52: /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53: PetscBTMemzero(mbs,table0);
54: for (l=k; l<isz; l++) { PetscBTSet(table0,nidx[l]); }
56: n = isz; /* length of the updated is[i] */
57: for (brow=0; brow<mbs; brow++){
58: start = ai[brow]; end = ai[brow+1];
59: if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
60: for (l = start; l<end ; l++){
61: bcol = aj[l];
62: if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l<end ; l++){
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table0,bcol)){
71: if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
79: /* expand the Index Set */
80: for (j=0; j<isz; j++) {
81: for (k=0; k<bs; k++)
82: nidx2[j*bs+k] = nidx[j]*bs+k;
83: }
84: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
85: }
86: PetscBTDestroy(table);
87: PetscFree(nidx);
88: PetscFree(nidx2);
89: PetscBTDestroy(table0);
90: return(0);
91: }
95: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
96: {
97: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
99: PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
100: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
101: PetscInt *irow,nrows,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
102: PetscInt *aj = a->j,*ai = a->i;
103: MatScalar *mat_a;
104: Mat C;
105: PetscTruth flag;
108: if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
109: ISSorted(iscol,(PetscTruth*)&i);
110: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
112: ISGetIndices(isrow,&irow);
113: ISGetSize(isrow,&nrows);
114:
115: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
116: ssmap = smap;
117: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
118: PetscMemzero(smap,oldcols*sizeof(PetscInt));
119: for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
120: /* determine lens of each row */
121: for (i=0; i<nrows; i++) {
122: kstart = ai[irow[i]];
123: kend = kstart + a->ilen[irow[i]];
124: lens[i] = 0;
125: for (k=kstart; k<kend; k++) {
126: if (ssmap[aj[k]]) {
127: lens[i]++;
128: }
129: }
130: }
131: /* Create and fill new matrix */
132: if (scall == MAT_REUSE_MATRIX) {
133: c = (Mat_SeqSBAIJ *)((*B)->data);
135: if (c->mbs!=nrows || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
136: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
137: if (!flag) {
138: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
139: }
140: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
141: C = *B;
142: } else {
143: MatCreate(A->comm,&C);
144: MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);
145: MatSetType(C,A->type_name);
146: MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
147: }
148: c = (Mat_SeqSBAIJ *)(C->data);
149: for (i=0; i<nrows; i++) {
150: row = irow[i];
151: kstart = ai[row];
152: kend = kstart + a->ilen[row];
153: mat_i = c->i[i];
154: mat_j = c->j + mat_i;
155: mat_a = c->a + mat_i*bs2;
156: mat_ilen = c->ilen + i;
157: for (k=kstart; k<kend; k++) {
158: if ((tcol=ssmap[a->j[k]])) {
159: *mat_j++ = tcol - 1;
160: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
161: mat_a += bs2;
162: (*mat_ilen)++;
163: }
164: }
165: }
166:
167: /* Free work space */
168: PetscFree(smap);
169: PetscFree(lens);
170: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
171: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
172:
173: ISRestoreIndices(isrow,&irow);
174: *B = C;
175: return(0);
176: }
180: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
181: {
182: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
183: IS is1;
185: PetscInt *vary,*iary,*irow,nrows,i,bs=A->rmap.bs,count;
188: if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
189:
190: ISGetIndices(isrow,&irow);
191: ISGetSize(isrow,&nrows);
192:
193: /* Verify if the indices corespond to each element in a block
194: and form the IS with compressed IS */
195: PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
196: iary = vary + a->mbs;
197: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
198: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
199:
200: count = 0;
201: for (i=0; i<a->mbs; i++) {
202: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
203: if (vary[i]==bs) iary[count++] = i;
204: }
205: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
206:
207: ISRestoreIndices(isrow,&irow);
208: PetscFree(vary);
210: MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);
211: ISDestroy(is1);
212: return(0);
213: }
217: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218: {
220: PetscInt i;
223: if (scall == MAT_INITIAL_MATRIX) {
224: PetscMalloc((n+1)*sizeof(Mat),B);
225: }
227: for (i=0; i<n; i++) {
228: MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
229: }
230: return(0);
231: }
233: /* -------------------------------------------------------*/
234: /* Should check that shapes of vectors and matrices match */
235: /* -------------------------------------------------------*/
236: #include petscblaslapack.h
240: PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
241: {
242: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
243: PetscScalar *x,*z,*xb,x1,zero=0.0;
244: MatScalar *v;
246: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
249: VecSet(zz,zero);
250: VecGetArray(xx,&x);
251: VecGetArray(zz,&z);
253: v = a->a;
254: xb = x;
255:
256: for (i=0; i<mbs; i++) {
257: n = ai[1] - ai[0]; /* length of i_th row of A */
258: x1 = *xb++;
259: ib = aj + *ai++;
260: jmin = 0;
261: /* if we ALWAYS required a diagonal entry then could remove this if test */
262: /* should we use a tmp to hold the accumulated z[i] */
263: if (*ib == i) { /* (diag of A)*x */
264: z[i] += *v++ * x[*ib++];
265: jmin++;
266: }
267: for (j=jmin; j<n; j++) {
268: cval = *ib;
269: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
270: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
271: }
272: }
274: VecRestoreArray(xx,&x);
275: VecRestoreArray(zz,&z);
276: PetscLogFlops(2*(a->nz*2 - A->rmap.N) - A->rmap.N); /* nz = (nz+m)/2 */
277: return(0);
278: }
282: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
283: {
284: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
285: PetscScalar *x,*z,*xb,x1,x2,zero=0.0;
286: MatScalar *v;
288: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
291: VecSet(zz,zero);
292: VecGetArray(xx,&x);
293: VecGetArray(zz,&z);
294:
295: v = a->a;
296: xb = x;
298: for (i=0; i<mbs; i++) {
299: n = ai[1] - ai[0]; /* length of i_th block row of A */
300: x1 = xb[0]; x2 = xb[1];
301: ib = aj + *ai;
302: jmin = 0;
303: if (*ib == i){ /* (diag of A)*x */
304: z[2*i] += v[0]*x1 + v[2]*x2;
305: z[2*i+1] += v[2]*x1 + v[3]*x2;
306: v += 4; jmin++;
307: }
308: for (j=jmin; j<n; j++) {
309: /* (strict lower triangular part of A)*x */
310: cval = ib[j]*2;
311: z[cval] += v[0]*x1 + v[1]*x2;
312: z[cval+1] += v[2]*x1 + v[3]*x2;
313: /* (strict upper triangular part of A)*x */
314: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
315: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
316: v += 4;
317: }
318: xb +=2; ai++;
319: }
321: VecRestoreArray(xx,&x);
322: VecRestoreArray(zz,&z);
323: PetscLogFlops(8*(a->nz*2 - A->rmap.N) - A->rmap.N);
324: return(0);
325: }
329: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
330: {
331: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
332: PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0;
333: MatScalar *v;
335: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
338: VecSet(zz,zero);
339: VecGetArray(xx,&x);
340: VecGetArray(zz,&z);
341:
342: v = a->a;
343: xb = x;
345: for (i=0; i<mbs; i++) {
346: n = ai[1] - ai[0]; /* length of i_th block row of A */
347: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
348: ib = aj + *ai;
349: jmin = 0;
350: if (*ib == i){ /* (diag of A)*x */
351: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
352: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
353: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
354: v += 9; jmin++;
355: }
356: for (j=jmin; j<n; j++) {
357: /* (strict lower triangular part of A)*x */
358: cval = ib[j]*3;
359: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
360: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
361: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
362: /* (strict upper triangular part of A)*x */
363: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
364: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
365: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
366: v += 9;
367: }
368: xb +=3; ai++;
369: }
371: VecRestoreArray(xx,&x);
372: VecRestoreArray(zz,&z);
373: PetscLogFlops(18*(a->nz*2 - A->rmap.N) - A->rmap.N);
374: return(0);
375: }
379: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
380: {
381: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
382: PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
383: MatScalar *v;
385: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
388: VecSet(zz,zero);
389: VecGetArray(xx,&x);
390: VecGetArray(zz,&z);
391:
392: v = a->a;
393: xb = x;
395: for (i=0; i<mbs; i++) {
396: n = ai[1] - ai[0]; /* length of i_th block row of A */
397: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
398: ib = aj + *ai;
399: jmin = 0;
400: if (*ib == i){ /* (diag of A)*x */
401: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
402: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
403: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
404: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
405: v += 16; jmin++;
406: }
407: for (j=jmin; j<n; j++) {
408: /* (strict lower triangular part of A)*x */
409: cval = ib[j]*4;
410: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
411: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
412: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
413: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
414: /* (strict upper triangular part of A)*x */
415: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
416: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
417: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
418: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
419: v += 16;
420: }
421: xb +=4; ai++;
422: }
424: VecRestoreArray(xx,&x);
425: VecRestoreArray(zz,&z);
426: PetscLogFlops(32*(a->nz*2 - A->rmap.N) - A->rmap.N);
427: return(0);
428: }
432: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
433: {
434: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
435: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
436: MatScalar *v;
438: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
441: VecSet(zz,zero);
442: VecGetArray(xx,&x);
443: VecGetArray(zz,&z);
444:
445: v = a->a;
446: xb = x;
448: for (i=0; i<mbs; i++) {
449: n = ai[1] - ai[0]; /* length of i_th block row of A */
450: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
451: ib = aj + *ai;
452: jmin = 0;
453: if (*ib == i){ /* (diag of A)*x */
454: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
455: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
456: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
457: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
458: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
459: v += 25; jmin++;
460: }
461: for (j=jmin; j<n; j++) {
462: /* (strict lower triangular part of A)*x */
463: cval = ib[j]*5;
464: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
465: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
466: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
467: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
468: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
469: /* (strict upper triangular part of A)*x */
470: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
471: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
472: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
473: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
474: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
475: v += 25;
476: }
477: xb +=5; ai++;
478: }
480: VecRestoreArray(xx,&x);
481: VecRestoreArray(zz,&z);
482: PetscLogFlops(50*(a->nz*2 - A->rmap.N) - A->rmap.N);
483: return(0);
484: }
489: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
490: {
491: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
492: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
493: MatScalar *v;
495: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
498: VecSet(zz,zero);
499: VecGetArray(xx,&x);
500: VecGetArray(zz,&z);
501:
502: v = a->a;
503: xb = x;
505: for (i=0; i<mbs; i++) {
506: n = ai[1] - ai[0]; /* length of i_th block row of A */
507: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
508: ib = aj + *ai;
509: jmin = 0;
510: if (*ib == i){ /* (diag of A)*x */
511: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
512: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
513: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
514: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
515: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
516: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
517: v += 36; jmin++;
518: }
519: for (j=jmin; j<n; j++) {
520: /* (strict lower triangular part of A)*x */
521: cval = ib[j]*6;
522: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
523: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
524: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
525: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
526: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
527: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
528: /* (strict upper triangular part of A)*x */
529: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
530: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
531: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
532: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
533: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
534: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
535: v += 36;
536: }
537: xb +=6; ai++;
538: }
540: VecRestoreArray(xx,&x);
541: VecRestoreArray(zz,&z);
542: PetscLogFlops(72*(a->nz*2 - A->rmap.N) - A->rmap.N);
543: return(0);
544: }
547: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
548: {
549: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
550: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
551: MatScalar *v;
553: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
556: VecSet(zz,zero);
557: VecGetArray(xx,&x);
558: VecGetArray(zz,&z);
559:
560: v = a->a;
561: xb = x;
563: for (i=0; i<mbs; i++) {
564: n = ai[1] - ai[0]; /* length of i_th block row of A */
565: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
566: ib = aj + *ai;
567: jmin = 0;
568: if (*ib == i){ /* (diag of A)*x */
569: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
570: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
571: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
572: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
573: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
574: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
575: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
576: v += 49; jmin++;
577: }
578: for (j=jmin; j<n; j++) {
579: /* (strict lower triangular part of A)*x */
580: cval = ib[j]*7;
581: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
582: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
583: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
584: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
585: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
586: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
587: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
588: /* (strict upper triangular part of A)*x */
589: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
590: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
591: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
592: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
593: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
594: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
595: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
596: v += 49;
597: }
598: xb +=7; ai++;
599: }
600: VecRestoreArray(xx,&x);
601: VecRestoreArray(zz,&z);
602: PetscLogFlops(98*(a->nz*2 - A->rmap.N) - A->rmap.N);
603: return(0);
604: }
606: /*
607: This will not work with MatScalar == float because it calls the BLAS
608: */
611: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
612: {
613: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
614: PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
615: MatScalar *v;
617: PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k;
620: VecSet(zz,zero);
621: VecGetArray(xx,&x); x_ptr=x;
622: VecGetArray(zz,&z); z_ptr=z;
624: aj = a->j;
625: v = a->a;
626: ii = a->i;
628: if (!a->mult_work) {
629: PetscMalloc((A->rmap.N+1)*sizeof(PetscScalar),&a->mult_work);
630: }
631: work = a->mult_work;
632:
633: for (i=0; i<mbs; i++) {
634: n = ii[1] - ii[0]; ncols = n*bs;
635: workt = work; idx=aj+ii[0];
637: /* upper triangular part */
638: for (j=0; j<n; j++) {
639: xb = x_ptr + bs*(*idx++);
640: for (k=0; k<bs; k++) workt[k] = xb[k];
641: workt += bs;
642: }
643: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
644: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
645:
646: /* strict lower triangular part */
647: idx = aj+ii[0];
648: if (*idx == i){
649: ncols -= bs; v += bs2; idx++; n--;
650: }
651:
652: if (ncols > 0){
653: workt = work;
654: PetscMemzero(workt,ncols*sizeof(PetscScalar));
655: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
656: for (j=0; j<n; j++) {
657: zb = z_ptr + bs*(*idx++);
658: for (k=0; k<bs; k++) zb[k] += workt[k] ;
659: workt += bs;
660: }
661: }
662: x += bs; v += n*bs2; z += bs; ii++;
663: }
664:
665: VecRestoreArray(xx,&x);
666: VecRestoreArray(zz,&z);
667: PetscLogFlops(2*(a->nz*2 - A->rmap.N)*bs2 - A->rmap.N);
668: return(0);
669: }
674: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
675: {
676: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
677: PetscScalar *x,*z,*xb,x1;
678: MatScalar *v;
680: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
683: VecCopy_Seq(yy,zz);
684: VecGetArray(xx,&x);
685: VecGetArray(zz,&z);
686: v = a->a;
687: xb = x;
689: for (i=0; i<mbs; i++) {
690: n = ai[1] - ai[0]; /* length of i_th row of A */
691: x1 = xb[0];
692: ib = aj + *ai;
693: jmin = 0;
694: if (*ib == i) { /* (diag of A)*x */
695: z[i] += *v++ * x[*ib++]; jmin++;
696: }
697: for (j=jmin; j<n; j++) {
698: cval = *ib;
699: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
700: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
701: }
702: xb++; ai++;
703: }
705: VecRestoreArray(xx,&x);
706: VecRestoreArray(zz,&z);
707:
708: PetscLogFlops(2*(a->nz*2 - A->rmap.n));
709: return(0);
710: }
714: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
715: {
716: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
717: PetscScalar *x,*z,*xb,x1,x2;
718: MatScalar *v;
720: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
723: VecCopy_Seq(yy,zz);
724: VecGetArray(xx,&x);
725: VecGetArray(zz,&z);
727: v = a->a;
728: xb = x;
730: for (i=0; i<mbs; i++) {
731: n = ai[1] - ai[0]; /* length of i_th block row of A */
732: x1 = xb[0]; x2 = xb[1];
733: ib = aj + *ai;
734: jmin = 0;
735: if (*ib == i){ /* (diag of A)*x */
736: z[2*i] += v[0]*x1 + v[2]*x2;
737: z[2*i+1] += v[2]*x1 + v[3]*x2;
738: v += 4; jmin++;
739: }
740: for (j=jmin; j<n; j++) {
741: /* (strict lower triangular part of A)*x */
742: cval = ib[j]*2;
743: z[cval] += v[0]*x1 + v[1]*x2;
744: z[cval+1] += v[2]*x1 + v[3]*x2;
745: /* (strict upper triangular part of A)*x */
746: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
747: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
748: v += 4;
749: }
750: xb +=2; ai++;
751: }
752: VecRestoreArray(xx,&x);
753: VecRestoreArray(zz,&z);
755: PetscLogFlops(4*(a->nz*2 - A->rmap.n));
756: return(0);
757: }
761: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
762: {
763: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
764: PetscScalar *x,*z,*xb,x1,x2,x3;
765: MatScalar *v;
767: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
770: VecCopy_Seq(yy,zz);
771: VecGetArray(xx,&x);
772: VecGetArray(zz,&z);
774: v = a->a;
775: xb = x;
777: for (i=0; i<mbs; i++) {
778: n = ai[1] - ai[0]; /* length of i_th block row of A */
779: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
780: ib = aj + *ai;
781: jmin = 0;
782: if (*ib == i){ /* (diag of A)*x */
783: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
784: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
785: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
786: v += 9; jmin++;
787: }
788: for (j=jmin; j<n; j++) {
789: /* (strict lower triangular part of A)*x */
790: cval = ib[j]*3;
791: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
792: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
793: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
794: /* (strict upper triangular part of A)*x */
795: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
796: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
797: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
798: v += 9;
799: }
800: xb +=3; ai++;
801: }
803: VecRestoreArray(xx,&x);
804: VecRestoreArray(zz,&z);
806: PetscLogFlops(18*(a->nz*2 - A->rmap.n));
807: return(0);
808: }
812: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
813: {
814: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
815: PetscScalar *x,*z,*xb,x1,x2,x3,x4;
816: MatScalar *v;
818: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
821: VecCopy_Seq(yy,zz);
822: VecGetArray(xx,&x);
823: VecGetArray(zz,&z);
825: v = a->a;
826: xb = x;
828: for (i=0; i<mbs; i++) {
829: n = ai[1] - ai[0]; /* length of i_th block row of A */
830: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
831: ib = aj + *ai;
832: jmin = 0;
833: if (*ib == i){ /* (diag of A)*x */
834: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
835: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
836: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
837: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
838: v += 16; jmin++;
839: }
840: for (j=jmin; j<n; j++) {
841: /* (strict lower triangular part of A)*x */
842: cval = ib[j]*4;
843: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
844: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
845: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
846: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
847: /* (strict upper triangular part of A)*x */
848: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
849: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
850: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
851: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
852: v += 16;
853: }
854: xb +=4; ai++;
855: }
857: VecRestoreArray(xx,&x);
858: VecRestoreArray(zz,&z);
860: PetscLogFlops(32*(a->nz*2 - A->rmap.n));
861: return(0);
862: }
866: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
867: {
868: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
869: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5;
870: MatScalar *v;
872: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
875: VecCopy_Seq(yy,zz);
876: VecGetArray(xx,&x);
877: VecGetArray(zz,&z);
879: v = a->a;
880: xb = x;
882: for (i=0; i<mbs; i++) {
883: n = ai[1] - ai[0]; /* length of i_th block row of A */
884: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
885: ib = aj + *ai;
886: jmin = 0;
887: if (*ib == i){ /* (diag of A)*x */
888: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
889: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
890: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
891: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
892: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
893: v += 25; jmin++;
894: }
895: for (j=jmin; j<n; j++) {
896: /* (strict lower triangular part of A)*x */
897: cval = ib[j]*5;
898: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
899: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
900: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
901: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
902: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
903: /* (strict upper triangular part of A)*x */
904: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
905: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
906: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
907: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
908: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
909: v += 25;
910: }
911: xb +=5; ai++;
912: }
914: VecRestoreArray(xx,&x);
915: VecRestoreArray(zz,&z);
917: PetscLogFlops(50*(a->nz*2 - A->rmap.n));
918: return(0);
919: }
922: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
923: {
924: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
925: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6;
926: MatScalar *v;
928: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
931: VecCopy_Seq(yy,zz);
932: VecGetArray(xx,&x);
933: VecGetArray(zz,&z);
935: v = a->a;
936: xb = x;
938: for (i=0; i<mbs; i++) {
939: n = ai[1] - ai[0]; /* length of i_th block row of A */
940: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
941: ib = aj + *ai;
942: jmin = 0;
943: if (*ib == i){ /* (diag of A)*x */
944: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
945: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
946: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
947: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
948: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
949: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
950: v += 36; jmin++;
951: }
952: for (j=jmin; j<n; j++) {
953: /* (strict lower triangular part of A)*x */
954: cval = ib[j]*6;
955: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
956: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
957: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
958: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
959: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
960: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
961: /* (strict upper triangular part of A)*x */
962: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
963: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
964: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
965: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
966: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
967: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
968: v += 36;
969: }
970: xb +=6; ai++;
971: }
973: VecRestoreArray(xx,&x);
974: VecRestoreArray(zz,&z);
976: PetscLogFlops(72*(a->nz*2 - A->rmap.n));
977: return(0);
978: }
982: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
983: {
984: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
985: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
986: MatScalar *v;
988: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
991: VecCopy_Seq(yy,zz);
992: VecGetArray(xx,&x);
993: VecGetArray(zz,&z);
995: v = a->a;
996: xb = x;
998: for (i=0; i<mbs; i++) {
999: n = ai[1] - ai[0]; /* length of i_th block row of A */
1000: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1001: ib = aj + *ai;
1002: jmin = 0;
1003: if (*ib == i){ /* (diag of A)*x */
1004: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1005: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1006: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1007: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1008: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1009: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1010: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1011: v += 49; jmin++;
1012: }
1013: for (j=jmin; j<n; j++) {
1014: /* (strict lower triangular part of A)*x */
1015: cval = ib[j]*7;
1016: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1017: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1018: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1019: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1020: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1021: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1022: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1023: /* (strict upper triangular part of A)*x */
1024: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1025: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1026: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1027: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1028: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1029: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1030: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1031: v += 49;
1032: }
1033: xb +=7; ai++;
1034: }
1036: VecRestoreArray(xx,&x);
1037: VecRestoreArray(zz,&z);
1039: PetscLogFlops(98*(a->nz*2 - A->rmap.n));
1040: return(0);
1041: }
1045: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1046: {
1047: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1048: PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1049: MatScalar *v;
1051: PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2,ncols,k;
1054: VecCopy_Seq(yy,zz);
1055: VecGetArray(xx,&x); x_ptr=x;
1056: VecGetArray(zz,&z); z_ptr=z;
1058: aj = a->j;
1059: v = a->a;
1060: ii = a->i;
1062: if (!a->mult_work) {
1063: PetscMalloc((A->rmap.n+1)*sizeof(PetscScalar),&a->mult_work);
1064: }
1065: work = a->mult_work;
1066:
1067:
1068: for (i=0; i<mbs; i++) {
1069: n = ii[1] - ii[0]; ncols = n*bs;
1070: workt = work; idx=aj+ii[0];
1072: /* upper triangular part */
1073: for (j=0; j<n; j++) {
1074: xb = x_ptr + bs*(*idx++);
1075: for (k=0; k<bs; k++) workt[k] = xb[k];
1076: workt += bs;
1077: }
1078: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1079: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1081: /* strict lower triangular part */
1082: idx = aj+ii[0];
1083: if (*idx == i){
1084: ncols -= bs; v += bs2; idx++; n--;
1085: }
1086: if (ncols > 0){
1087: workt = work;
1088: PetscMemzero(workt,ncols*sizeof(PetscScalar));
1089: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1090: for (j=0; j<n; j++) {
1091: zb = z_ptr + bs*(*idx++);
1092: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1093: workt += bs;
1094: }
1095: }
1097: x += bs; v += n*bs2; z += bs; ii++;
1098: }
1100: VecRestoreArray(xx,&x);
1101: VecRestoreArray(zz,&z);
1103: PetscLogFlops(2*(a->nz*2 - A->rmap.n));
1104: return(0);
1105: }
1109: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1110: {
1111: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1112: PetscScalar oalpha = alpha;
1113: PetscBLASInt one = 1,totalnz = (PetscBLASInt)a->bs2*a->nz;
1117: BLASscal_(&totalnz,&oalpha,a->a,&one);
1118: PetscLogFlops(totalnz);
1119: return(0);
1120: }
1124: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1125: {
1126: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1127: MatScalar *v = a->a;
1128: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1129: PetscInt i,j,k,bs = A->rmap.bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1131: PetscInt *jl,*il,jmin,jmax,nexti,ik,*col;
1132:
1134: if (type == NORM_FROBENIUS) {
1135: for (k=0; k<mbs; k++){
1136: jmin = a->i[k]; jmax = a->i[k+1];
1137: col = aj + jmin;
1138: if (*col == k){ /* diagonal block */
1139: for (i=0; i<bs2; i++){
1140: #if defined(PETSC_USE_COMPLEX)
1141: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1142: #else
1143: sum_diag += (*v)*(*v); v++;
1144: #endif
1145: }
1146: jmin++;
1147: }
1148: for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */
1149: for (i=0; i<bs2; i++){
1150: #if defined(PETSC_USE_COMPLEX)
1151: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1152: #else
1153: sum_off += (*v)*(*v); v++;
1154: #endif
1155: }
1156: }
1157: }
1158: *norm = sqrt(sum_diag + 2*sum_off);
1159: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1160: PetscMalloc((2*mbs+1)*sizeof(PetscInt)+bs*sizeof(PetscReal),&il);
1161: jl = il + mbs;
1162: sum = (PetscReal*)(jl + mbs);
1163: for (i=0; i<mbs; i++) jl[i] = mbs;
1164: il[0] = 0;
1166: *norm = 0.0;
1167: for (k=0; k<mbs; k++) { /* k_th block row */
1168: for (j=0; j<bs; j++) sum[j]=0.0;
1169: /*-- col sum --*/
1170: i = jl[k]; /* first |A(i,k)| to be added */
1171: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1172: at step k */
1173: while (i<mbs){
1174: nexti = jl[i]; /* next block row to be added */
1175: ik = il[i]; /* block index of A(i,k) in the array a */
1176: for (j=0; j<bs; j++){
1177: v = a->a + ik*bs2 + j*bs;
1178: for (k1=0; k1<bs; k1++) {
1179: sum[j] += PetscAbsScalar(*v); v++;
1180: }
1181: }
1182: /* update il, jl */
1183: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1184: jmax = a->i[i+1];
1185: if (jmin < jmax){
1186: il[i] = jmin;
1187: j = a->j[jmin];
1188: jl[i] = jl[j]; jl[j]=i;
1189: }
1190: i = nexti;
1191: }
1192: /*-- row sum --*/
1193: jmin = a->i[k]; jmax = a->i[k+1];
1194: for (i=jmin; i<jmax; i++) {
1195: for (j=0; j<bs; j++){
1196: v = a->a + i*bs2 + j;
1197: for (k1=0; k1<bs; k1++){
1198: sum[j] += PetscAbsScalar(*v); v += bs;
1199: }
1200: }
1201: }
1202: /* add k_th block row to il, jl */
1203: col = aj+jmin;
1204: if (*col == k) jmin++;
1205: if (jmin < jmax){
1206: il[k] = jmin;
1207: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1208: }
1209: for (j=0; j<bs; j++){
1210: if (sum[j] > *norm) *norm = sum[j];
1211: }
1212: }
1213: PetscFree(il);
1214: } else {
1215: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1216: }
1217: return(0);
1218: }
1222: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1223: {
1224: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1229: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1230: if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1231: *flg = PETSC_FALSE;
1232: return(0);
1233: }
1234:
1235: /* if the a->i are the same */
1236: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1237: if (!*flg) {
1238: return(0);
1239: }
1240:
1241: /* if a->j are the same */
1242: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1243: if (!*flg) {
1244: return(0);
1245: }
1246: /* if a->a are the same */
1247: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(A->rmap.bs)*sizeof(PetscScalar),flg);
1248: return(0);
1249: }
1253: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1254: {
1255: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1257: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1258: PetscScalar *x,zero = 0.0;
1259: MatScalar *aa,*aa_j;
1262: bs = A->rmap.bs;
1263: if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1264:
1265: aa = a->a;
1266: ai = a->i;
1267: aj = a->j;
1268: ambs = a->mbs;
1269: bs2 = a->bs2;
1271: VecSet(v,zero);
1272: VecGetArray(v,&x);
1273: VecGetLocalSize(v,&n);
1274: if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1275: for (i=0; i<ambs; i++) {
1276: j=ai[i];
1277: if (aj[j] == i) { /* if this is a diagonal element */
1278: row = i*bs;
1279: aa_j = aa + j*bs2;
1280: if (A->factor && bs==1){
1281: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1282: } else {
1283: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1284: }
1285: }
1286: }
1287:
1288: VecRestoreArray(v,&x);
1289: return(0);
1290: }
1294: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1295: {
1296: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1297: PetscScalar *l,x,*li,*ri;
1298: MatScalar *aa,*v;
1300: PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1301: PetscTruth flg;
1304: if (ll != rr){
1305: VecEqual(ll,rr,&flg);
1306: if (!flg)
1307: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1308: }
1309: if (!ll) return(0);
1310: ai = a->i;
1311: aj = a->j;
1312: aa = a->a;
1313: m = A->rmap.N;
1314: bs = A->rmap.bs;
1315: mbs = a->mbs;
1316: bs2 = a->bs2;
1318: VecGetArray(ll,&l);
1319: VecGetLocalSize(ll,&lm);
1320: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1321: for (i=0; i<mbs; i++) { /* for each block row */
1322: M = ai[i+1] - ai[i];
1323: li = l + i*bs;
1324: v = aa + bs2*ai[i];
1325: for (j=0; j<M; j++) { /* for each block */
1326: ri = l + bs*aj[ai[i]+j];
1327: for (k=0; k<bs; k++) {
1328: x = ri[k];
1329: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1330: }
1331: }
1332: }
1333: VecRestoreArray(ll,&l);
1334: PetscLogFlops(2*a->nz);
1335: return(0);
1336: }
1340: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1341: {
1342: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1345: info->rows_global = (double)A->rmap.N;
1346: info->columns_global = (double)A->rmap.N;
1347: info->rows_local = (double)A->rmap.N;
1348: info->columns_local = (double)A->rmap.N;
1349: info->block_size = a->bs2;
1350: info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */
1351: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1352: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1353: info->assemblies = A->num_ass;
1354: info->mallocs = a->reallocs;
1355: info->memory = A->mem;
1356: if (A->factor) {
1357: info->fill_ratio_given = A->info.fill_ratio_given;
1358: info->fill_ratio_needed = A->info.fill_ratio_needed;
1359: info->factor_mallocs = A->info.factor_mallocs;
1360: } else {
1361: info->fill_ratio_given = 0;
1362: info->fill_ratio_needed = 0;
1363: info->factor_mallocs = 0;
1364: }
1365: return(0);
1366: }
1371: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1372: {
1373: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1377: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1378: return(0);
1379: }
1383: PetscErrorCode MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1384: {
1385: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1387: PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1388: PetscReal atmp;
1389: MatScalar *aa;
1390: PetscScalar zero = 0.0,*x;
1391: PetscInt ncols,brow,bcol,krow,kcol;
1394: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1395: bs = A->rmap.bs;
1396: aa = a->a;
1397: ai = a->i;
1398: aj = a->j;
1399: mbs = a->mbs;
1401: VecSet(v,zero);
1402: VecGetArray(v,&x);
1403: VecGetLocalSize(v,&n);
1404: if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1405: for (i=0; i<mbs; i++) {
1406: ncols = ai[1] - ai[0]; ai++;
1407: brow = bs*i;
1408: for (j=0; j<ncols; j++){
1409: bcol = bs*(*aj);
1410: for (kcol=0; kcol<bs; kcol++){
1411: col = bcol + kcol; /* col index */
1412: for (krow=0; krow<bs; krow++){
1413: atmp = PetscAbsScalar(*aa); aa++;
1414: row = brow + krow; /* row index */
1415: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1416: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1417: }
1418: }
1419: aj++;
1420: }
1421: }
1422: VecRestoreArray(v,&x);
1423: return(0);
1424: }