Actual source code: sbaij2.c
1: /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/vec/vecimpl.h
5: #include src/inline/spops.h
6: #include src/inline/ilu.h
7: #include petscbt.h
8: #include src/mat/impls/sbaij/seq/sbaij.h
10: int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
11: {
13: SETERRQ(1,"Function not yet written for SBAIJ format");
14: /* return(0); */
15: }
17: int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
18: {
19: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
20: int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
21: int row,mat_i,*mat_j,tcol,*mat_ilen;
22: int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
23: int *aj = a->j,*ai = a->i;
24: MatScalar *mat_a;
25: Mat C;
26: PetscTruth flag;
29:
30: if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
31: ISSorted(iscol,(PetscTruth*)&i);
32: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
34: ISGetIndices(isrow,&irow);
35: ISGetSize(isrow,&nrows);
36:
37: ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);
38: ssmap = smap;
39: ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);
40: ierr = PetscMemzero(smap,oldcols*sizeof(int));
41: for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
42: /* determine lens of each row */
43: for (i=0; i<nrows; i++) {
44: kstart = ai[irow[i]];
45: kend = kstart + a->ilen[irow[i]];
46: lens[i] = 0;
47: for (k=kstart; k<kend; k++) {
48: if (ssmap[aj[k]]) {
49: lens[i]++;
50: }
51: }
52: }
53: /* Create and fill new matrix */
54: if (scall == MAT_REUSE_MATRIX) {
55: c = (Mat_SeqSBAIJ *)((*B)->data);
57: if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
58: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
59: if (flag == PETSC_FALSE) {
60: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
61: }
62: PetscMemzero(c->ilen,c->mbs*sizeof(int));
63: C = *B;
64: } else {
65: MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);
66: }
67: c = (Mat_SeqSBAIJ *)(C->data);
68: for (i=0; i<nrows; i++) {
69: row = irow[i];
70: kstart = ai[row];
71: kend = kstart + a->ilen[row];
72: mat_i = c->i[i];
73: mat_j = c->j + mat_i;
74: mat_a = c->a + mat_i*bs2;
75: mat_ilen = c->ilen + i;
76: for (k=kstart; k<kend; k++) {
77: if ((tcol=ssmap[a->j[k]])) {
78: *mat_j++ = tcol - 1;
79: ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
80: mat_a += bs2;
81: (*mat_ilen)++;
82: }
83: }
84: }
85:
86: /* Free work space */
87: PetscFree(smap);
88: PetscFree(lens);
89: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
90: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
91:
92: ISRestoreIndices(isrow,&irow);
93: *B = C;
94: return(0);
95: }
97: int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
98: {
99: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
100: IS is1;
101: int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
104: if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
105:
106: ISGetIndices(isrow,&irow);
107: ISGetSize(isrow,&nrows);
108:
109: /* Verify if the indices corespond to each element in a block
110: and form the IS with compressed IS */
111: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
112: iary = vary + a->mbs;
113: PetscMemzero(vary,(a->mbs)*sizeof(int));
114: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
115:
116: count = 0;
117: for (i=0; i<a->mbs; i++) {
118: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
119: if (vary[i]==bs) iary[count++] = i;
120: }
121: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
122:
123: ISRestoreIndices(isrow,&irow);
124: PetscFree(vary);
126: MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);
127: ISDestroy(is1);
128: return(0);
129: }
131: int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
132: {
133: int ierr,i;
136: if (scall == MAT_INITIAL_MATRIX) {
137: PetscMalloc((n+1)*sizeof(Mat),B);
138: }
140: for (i=0; i<n; i++) {
141: MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
142: }
143: return(0);
144: }
146: /* -------------------------------------------------------*/
147: /* Should check that shapes of vectors and matrices match */
148: /* -------------------------------------------------------*/
149: #include petscblaslapack.h
151: int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
152: {
153: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
154: PetscScalar *x,*z,*xb,x1,zero=0.0;
155: MatScalar *v;
156: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
159: VecSet(&zero,zz);
160: VecGetArray(xx,&x);
161: VecGetArray(zz,&z);
163: v = a->a;
164: xb = x;
165:
166: for (i=0; i<mbs; i++) {
167: n = ai[1] - ai[0]; /* length of i_th row of A */
168: x1 = xb[0];
169: ib = aj + *ai;
170: jmin = 0;
171: if (*ib == i) { /* (diag of A)*x */
172: z[i] += *v++ * x[*ib++];
173: jmin++;
174: }
175: for (j=jmin; j<n; j++) {
176: cval = *ib;
177: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
178: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
179: }
180: xb++; ai++;
181: }
183: VecRestoreArray(xx,&x);
184: VecRestoreArray(zz,&z);
185: PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m); /* s_nz = (nz+m)/2 */
186: return(0);
187: }
189: int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
190: {
191: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
192: PetscScalar *x,*z,*xb,x1,x2,zero=0.0;
193: MatScalar *v;
194: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
198: VecSet(&zero,zz);
199: VecGetArray(xx,&x);
200: VecGetArray(zz,&z);
201:
202: v = a->a;
203: xb = x;
205: for (i=0; i<mbs; i++) {
206: n = ai[1] - ai[0]; /* length of i_th block row of A */
207: x1 = xb[0]; x2 = xb[1];
208: ib = aj + *ai;
209: jmin = 0;
210: if (*ib == i){ /* (diag of A)*x */
211: z[2*i] += v[0]*x1 + v[2]*x2;
212: z[2*i+1] += v[2]*x1 + v[3]*x2;
213: v += 4; jmin++;
214: }
215: for (j=jmin; j<n; j++) {
216: /* (strict lower triangular part of A)*x */
217: cval = ib[j]*2;
218: z[cval] += v[0]*x1 + v[1]*x2;
219: z[cval+1] += v[2]*x1 + v[3]*x2;
220: /* (strict upper triangular part of A)*x */
221: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
222: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
223: v += 4;
224: }
225: xb +=2; ai++;
226: }
228: VecRestoreArray(xx,&x);
229: VecRestoreArray(zz,&z);
230: PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
231: return(0);
232: }
234: int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
235: {
236: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
237: PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0;
238: MatScalar *v;
239: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
243: VecSet(&zero,zz);
244: VecGetArray(xx,&x);
245: VecGetArray(zz,&z);
246:
247: v = a->a;
248: xb = x;
250: for (i=0; i<mbs; i++) {
251: n = ai[1] - ai[0]; /* length of i_th block row of A */
252: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
253: ib = aj + *ai;
254: jmin = 0;
255: if (*ib == i){ /* (diag of A)*x */
256: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
257: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
258: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
259: v += 9; jmin++;
260: }
261: for (j=jmin; j<n; j++) {
262: /* (strict lower triangular part of A)*x */
263: cval = ib[j]*3;
264: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
265: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
266: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
267: /* (strict upper triangular part of A)*x */
268: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
269: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
270: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
271: v += 9;
272: }
273: xb +=3; ai++;
274: }
276: VecRestoreArray(xx,&x);
277: VecRestoreArray(zz,&z);
278: PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
279: return(0);
280: }
282: int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
283: {
284: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
285: PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
286: MatScalar *v;
287: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
290: VecSet(&zero,zz);
291: VecGetArray(xx,&x);
292: VecGetArray(zz,&z);
293:
294: v = a->a;
295: xb = x;
297: for (i=0; i<mbs; i++) {
298: n = ai[1] - ai[0]; /* length of i_th block row of A */
299: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
300: ib = aj + *ai;
301: jmin = 0;
302: if (*ib == i){ /* (diag of A)*x */
303: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
304: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
305: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
306: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
307: v += 16; jmin++;
308: }
309: for (j=jmin; j<n; j++) {
310: /* (strict lower triangular part of A)*x */
311: cval = ib[j]*4;
312: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
313: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
314: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
315: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
316: /* (strict upper triangular part of A)*x */
317: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
318: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
319: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
320: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
321: v += 16;
322: }
323: xb +=4; ai++;
324: }
326: VecRestoreArray(xx,&x);
327: VecRestoreArray(zz,&z);
328: PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
329: return(0);
330: }
332: int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
333: {
334: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
335: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
336: MatScalar *v;
337: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
340: VecSet(&zero,zz);
341: VecGetArray(xx,&x);
342: VecGetArray(zz,&z);
343:
344: v = a->a;
345: xb = x;
347: for (i=0; i<mbs; i++) {
348: n = ai[1] - ai[0]; /* length of i_th block row of A */
349: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
350: ib = aj + *ai;
351: jmin = 0;
352: if (*ib == i){ /* (diag of A)*x */
353: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
354: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
355: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
356: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
357: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
358: v += 25; jmin++;
359: }
360: for (j=jmin; j<n; j++) {
361: /* (strict lower triangular part of A)*x */
362: cval = ib[j]*5;
363: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
364: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
365: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
366: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
367: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
368: /* (strict upper triangular part of A)*x */
369: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
370: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
371: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
372: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
373: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
374: v += 25;
375: }
376: xb +=5; ai++;
377: }
379: VecRestoreArray(xx,&x);
380: VecRestoreArray(zz,&z);
381: PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
382: return(0);
383: }
386: int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
387: {
388: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
389: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
390: MatScalar *v;
391: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
394: VecSet(&zero,zz);
395: VecGetArray(xx,&x);
396: VecGetArray(zz,&z);
397:
398: v = a->a;
399: xb = x;
401: for (i=0; i<mbs; i++) {
402: n = ai[1] - ai[0]; /* length of i_th block row of A */
403: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
404: ib = aj + *ai;
405: jmin = 0;
406: if (*ib == i){ /* (diag of A)*x */
407: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
408: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
409: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
410: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
411: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
412: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
413: v += 36; jmin++;
414: }
415: for (j=jmin; j<n; j++) {
416: /* (strict lower triangular part of A)*x */
417: cval = ib[j]*6;
418: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
419: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
420: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
421: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
422: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
423: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
424: /* (strict upper triangular part of A)*x */
425: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
426: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
427: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
428: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
429: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
430: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
431: v += 36;
432: }
433: xb +=6; ai++;
434: }
436: VecRestoreArray(xx,&x);
437: VecRestoreArray(zz,&z);
438: PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
439: return(0);
440: }
441: int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
442: {
443: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
444: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
445: MatScalar *v;
446: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
449: VecSet(&zero,zz);
450: VecGetArray(xx,&x);
451: VecGetArray(zz,&z);
452:
453: v = a->a;
454: xb = x;
456: for (i=0; i<mbs; i++) {
457: n = ai[1] - ai[0]; /* length of i_th block row of A */
458: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
459: ib = aj + *ai;
460: jmin = 0;
461: if (*ib == i){ /* (diag of A)*x */
462: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
463: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
464: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
465: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
466: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
467: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
468: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
469: v += 49; jmin++;
470: }
471: for (j=jmin; j<n; j++) {
472: /* (strict lower triangular part of A)*x */
473: cval = ib[j]*7;
474: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
475: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
476: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
477: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
478: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
479: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
480: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
481: /* (strict upper triangular part of A)*x */
482: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
483: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
484: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
485: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
486: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
487: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
488: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
489: v += 49;
490: }
491: xb +=7; ai++;
492: }
493: VecRestoreArray(xx,&x);
494: VecRestoreArray(zz,&z);
495: PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
496: return(0);
497: }
499: /*
500: This will not work with MatScalar == float because it calls the BLAS
501: */
502: int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
503: {
504: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
505: PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
506: MatScalar *v;
507: int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
508: int ncols,k;
511: VecSet(&zero,zz);
512: VecGetArray(xx,&x); x_ptr=x;
513: VecGetArray(zz,&z); z_ptr=z;
515: aj = a->j;
516: v = a->a;
517: ii = a->i;
519: if (!a->mult_work) {
520: PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);
521: }
522: work = a->mult_work;
523:
524: for (i=0; i<mbs; i++) {
525: n = ii[1] - ii[0]; ncols = n*bs;
526: workt = work; idx=aj+ii[0];
528: /* upper triangular part */
529: for (j=0; j<n; j++) {
530: xb = x_ptr + bs*(*idx++);
531: for (k=0; k<bs; k++) workt[k] = xb[k];
532: workt += bs;
533: }
534: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
535: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
536:
537: /* strict lower triangular part */
538: idx = aj+ii[0];
539: if (*idx == i){
540: ncols -= bs; v += bs2; idx++; n--;
541: }
542:
543: if (ncols > 0){
544: workt = work;
545: ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));
546: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
547: for (j=0; j<n; j++) {
548: zb = z_ptr + bs*(*idx++);
549: for (k=0; k<bs; k++) zb[k] += workt[k] ;
550: workt += bs;
551: }
552: }
553: x += bs; v += n*bs2; z += bs; ii++;
554: }
555:
556: VecRestoreArray(xx,&x);
557: VecRestoreArray(zz,&z);
558: PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
559: return(0);
560: }
562: int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
563: {
564: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
565: PetscScalar *x,*y,*z,*xb,x1;
566: MatScalar *v;
567: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
570: VecGetArray(xx,&x);
571: if (yy != xx) {
572: VecGetArray(yy,&y);
573: } else {
574: y = x;
575: }
576: if (zz != yy) {
577: /* VecCopy(yy,zz); */
578: VecGetArray(zz,&z);
579: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
580: } else {
581: z = y;
582: }
584: v = a->a;
585: xb = x;
587: for (i=0; i<mbs; i++) {
588: n = ai[1] - ai[0]; /* length of i_th row of A */
589: x1 = xb[0];
590: ib = aj + *ai;
591: jmin = 0;
592: if (*ib == i) { /* (diag of A)*x */
593: z[i] += *v++ * x[*ib++]; jmin++;
594: }
595: for (j=jmin; j<n; j++) {
596: cval = *ib;
597: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
598: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
599: }
600: xb++; ai++;
601: }
603: VecRestoreArray(xx,&x);
604: if (yy != xx) VecRestoreArray(yy,&y);
605: if (zz != yy) VecRestoreArray(zz,&z);
606:
607: PetscLogFlops(2*(a->s_nz*2 - A->m));
608: return(0);
609: }
611: int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
612: {
613: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
614: PetscScalar *x,*y,*z,*xb,x1,x2;
615: MatScalar *v;
616: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
619: VecGetArray(xx,&x);
620: if (yy != xx) {
621: VecGetArray(yy,&y);
622: } else {
623: y = x;
624: }
625: if (zz != yy) {
626: /* VecCopy(yy,zz); */
627: VecGetArray(zz,&z);
628: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
629: } else {
630: z = y;
631: }
633: v = a->a;
634: xb = x;
636: for (i=0; i<mbs; i++) {
637: n = ai[1] - ai[0]; /* length of i_th block row of A */
638: x1 = xb[0]; x2 = xb[1];
639: ib = aj + *ai;
640: jmin = 0;
641: if (*ib == i){ /* (diag of A)*x */
642: z[2*i] += v[0]*x1 + v[2]*x2;
643: z[2*i+1] += v[2]*x1 + v[3]*x2;
644: v += 4; jmin++;
645: }
646: for (j=jmin; j<n; j++) {
647: /* (strict lower triangular part of A)*x */
648: cval = ib[j]*2;
649: z[cval] += v[0]*x1 + v[1]*x2;
650: z[cval+1] += v[2]*x1 + v[3]*x2;
651: /* (strict upper triangular part of A)*x */
652: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
653: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
654: v += 4;
655: }
656: xb +=2; ai++;
657: }
659: VecRestoreArray(xx,&x);
660: if (yy != xx) VecRestoreArray(yy,&y);
661: if (zz != yy) VecRestoreArray(zz,&z);
663: PetscLogFlops(4*(a->s_nz*2 - A->m));
664: return(0);
665: }
667: int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
668: {
669: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
670: PetscScalar *x,*y,*z,*xb,x1,x2,x3;
671: MatScalar *v;
672: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
675: VecGetArray(xx,&x);
676: if (yy != xx) {
677: VecGetArray(yy,&y);
678: } else {
679: y = x;
680: }
681: if (zz != yy) {
682: /* VecCopy(yy,zz); */
683: VecGetArray(zz,&z);
684: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
685: } else {
686: z = y;
687: }
689: v = a->a;
690: xb = x;
692: for (i=0; i<mbs; i++) {
693: n = ai[1] - ai[0]; /* length of i_th block row of A */
694: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
695: ib = aj + *ai;
696: jmin = 0;
697: if (*ib == i){ /* (diag of A)*x */
698: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
699: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
700: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
701: v += 9; jmin++;
702: }
703: for (j=jmin; j<n; j++) {
704: /* (strict lower triangular part of A)*x */
705: cval = ib[j]*3;
706: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
707: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
708: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
709: /* (strict upper triangular part of A)*x */
710: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
711: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
712: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
713: v += 9;
714: }
715: xb +=3; ai++;
716: }
718: VecRestoreArray(xx,&x);
719: if (yy != xx) VecRestoreArray(yy,&y);
720: if (zz != yy) VecRestoreArray(zz,&z);
722: PetscLogFlops(18*(a->s_nz*2 - A->m));
723: return(0);
724: }
726: int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
727: {
728: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
729: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4;
730: MatScalar *v;
731: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
734: VecGetArray(xx,&x);
735: if (yy != xx) {
736: VecGetArray(yy,&y);
737: } else {
738: y = x;
739: }
740: if (zz != yy) {
741: /* VecCopy(yy,zz); */
742: VecGetArray(zz,&z);
743: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
744: } else {
745: z = y;
746: }
748: v = a->a;
749: xb = x;
751: for (i=0; i<mbs; i++) {
752: n = ai[1] - ai[0]; /* length of i_th block row of A */
753: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
754: ib = aj + *ai;
755: jmin = 0;
756: if (*ib == i){ /* (diag of A)*x */
757: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
758: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
759: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
760: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
761: v += 16; jmin++;
762: }
763: for (j=jmin; j<n; j++) {
764: /* (strict lower triangular part of A)*x */
765: cval = ib[j]*4;
766: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
767: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
768: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
769: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
770: /* (strict upper triangular part of A)*x */
771: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
772: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
773: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
774: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
775: v += 16;
776: }
777: xb +=4; ai++;
778: }
780: VecRestoreArray(xx,&x);
781: if (yy != xx) VecRestoreArray(yy,&y);
782: if (zz != yy) VecRestoreArray(zz,&z);
784: PetscLogFlops(32*(a->s_nz*2 - A->m));
785: return(0);
786: }
788: int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
789: {
790: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
791: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5;
792: MatScalar *v;
793: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
796: VecGetArray(xx,&x);
797: if (yy != xx) {
798: VecGetArray(yy,&y);
799: } else {
800: y = x;
801: }
802: if (zz != yy) {
803: /* VecCopy(yy,zz); */
804: VecGetArray(zz,&z);
805: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
806: } else {
807: z = y;
808: }
810: v = a->a;
811: xb = x;
813: for (i=0; i<mbs; i++) {
814: n = ai[1] - ai[0]; /* length of i_th block row of A */
815: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
816: ib = aj + *ai;
817: jmin = 0;
818: if (*ib == i){ /* (diag of A)*x */
819: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
820: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
821: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
822: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
823: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
824: v += 25; jmin++;
825: }
826: for (j=jmin; j<n; j++) {
827: /* (strict lower triangular part of A)*x */
828: cval = ib[j]*5;
829: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
830: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
831: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
832: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
833: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
834: /* (strict upper triangular part of A)*x */
835: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
836: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
837: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
838: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
839: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
840: v += 25;
841: }
842: xb +=5; ai++;
843: }
845: VecRestoreArray(xx,&x);
846: if (yy != xx) VecRestoreArray(yy,&y);
847: if (zz != yy) VecRestoreArray(zz,&z);
849: PetscLogFlops(50*(a->s_nz*2 - A->m));
850: return(0);
851: }
852: int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
853: {
854: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
855: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
856: MatScalar *v;
857: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
860: VecGetArray(xx,&x);
861: if (yy != xx) {
862: VecGetArray(yy,&y);
863: } else {
864: y = x;
865: }
866: if (zz != yy) {
867: /* VecCopy(yy,zz); */
868: VecGetArray(zz,&z);
869: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
870: } else {
871: z = y;
872: }
874: v = a->a;
875: xb = x;
877: for (i=0; i<mbs; i++) {
878: n = ai[1] - ai[0]; /* length of i_th block row of A */
879: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
880: ib = aj + *ai;
881: jmin = 0;
882: if (*ib == i){ /* (diag of A)*x */
883: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
884: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
885: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
886: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
887: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
888: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
889: v += 36; jmin++;
890: }
891: for (j=jmin; j<n; j++) {
892: /* (strict lower triangular part of A)*x */
893: cval = ib[j]*6;
894: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
895: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
896: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
897: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
898: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
899: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
900: /* (strict upper triangular part of A)*x */
901: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
902: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
903: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
904: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
905: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
906: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
907: v += 36;
908: }
909: xb +=6; ai++;
910: }
912: VecRestoreArray(xx,&x);
913: if (yy != xx) VecRestoreArray(yy,&y);
914: if (zz != yy) VecRestoreArray(zz,&z);
916: PetscLogFlops(72*(a->s_nz*2 - A->m));
917: return(0);
918: }
920: int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
921: {
922: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
923: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
924: MatScalar *v;
925: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
928: VecGetArray(xx,&x);
929: if (yy != xx) {
930: VecGetArray(yy,&y);
931: } else {
932: y = x;
933: }
934: if (zz != yy) {
935: /* VecCopy(yy,zz); */
936: VecGetArray(zz,&z);
937: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
938: } else {
939: z = y;
940: }
942: v = a->a;
943: xb = x;
945: for (i=0; i<mbs; i++) {
946: n = ai[1] - ai[0]; /* length of i_th block row of A */
947: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
948: ib = aj + *ai;
949: jmin = 0;
950: if (*ib == i){ /* (diag of A)*x */
951: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
952: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
953: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
954: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
955: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
956: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
957: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
958: v += 49; jmin++;
959: }
960: for (j=jmin; j<n; j++) {
961: /* (strict lower triangular part of A)*x */
962: cval = ib[j]*7;
963: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
964: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
965: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
966: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
967: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
968: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
969: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
970: /* (strict upper triangular part of A)*x */
971: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
972: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
973: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
974: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
975: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
976: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
977: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
978: v += 49;
979: }
980: xb +=7; ai++;
981: }
983: VecRestoreArray(xx,&x);
984: if (yy != xx) VecRestoreArray(yy,&y);
985: if (zz != yy) VecRestoreArray(zz,&z);
987: PetscLogFlops(98*(a->s_nz*2 - A->m));
988: return(0);
989: }
991: int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
992: {
993: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
994: PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
995: MatScalar *v;
996: int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
997: int ncols,k;
1000: VecGetArray(xx,&x); x_ptr=x;
1001: if (yy != xx) {
1002: VecGetArray(yy,&y);
1003: } else {
1004: y = x;
1005: }
1006: if (zz != yy) {
1007: /* VecCopy(yy,zz); */
1008: VecGetArray(zz,&z); z_ptr=z;
1009: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
1010: } else {
1011: z = y;
1012: }
1014: aj = a->j;
1015: v = a->a;
1016: ii = a->i;
1018: if (!a->mult_work) {
1019: PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);
1020: }
1021: work = a->mult_work;
1022:
1023:
1024: for (i=0; i<mbs; i++) {
1025: n = ii[1] - ii[0]; ncols = n*bs;
1026: workt = work; idx=aj+ii[0];
1028: /* upper triangular part */
1029: for (j=0; j<n; j++) {
1030: xb = x_ptr + bs*(*idx++);
1031: for (k=0; k<bs; k++) workt[k] = xb[k];
1032: workt += bs;
1033: }
1034: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1035: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1037: /* strict lower triangular part */
1038: idx = aj+ii[0];
1039: if (*idx == i){
1040: ncols -= bs; v += bs2; idx++; n--;
1041: }
1042: if (ncols > 0){
1043: workt = work;
1044: ierr = PetscMemzero(workt,ncols*sizeof(PetscScalar));
1045: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1046: for (j=0; j<n; j++) {
1047: zb = z_ptr + bs*(*idx++);
1048: /* idx++; */
1049: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1050: workt += bs;
1051: }
1052: }
1054: x += bs; v += n*bs2; z += bs; ii++;
1055: }
1057: VecRestoreArray(xx,&x);
1058: if (yy != xx) VecRestoreArray(yy,&y);
1059: if (zz != yy) VecRestoreArray(zz,&z);
1061: PetscLogFlops(2*(a->s_nz*2 - A->m));
1062: return(0);
1063: }
1065: int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1066: {
1068: SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1069: /* return(0); */
1070: }
1072: int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1074: {
1076: SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1077: /* return(0); */
1078: }
1080: int MatScale_SeqSBAIJ(PetscScalar *alpha,Mat inA)
1081: {
1082: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1083: int one = 1,totalnz = a->bs2*a->s_nz;
1086: BLscal_(&totalnz,alpha,a->a,&one);
1087: PetscLogFlops(totalnz);
1088: return(0);
1089: }
1091: int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1092: {
1093: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1094: MatScalar *v = a->a;
1095: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1096: int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1097: int *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1098:
1100: if (type == NORM_FROBENIUS) {
1101: for (k=0; k<mbs; k++){
1102: jmin = a->i[k]; jmax = a->i[k+1];
1103: col = aj + jmin;
1104: if (*col == k){ /* diagonal block */
1105: for (i=0; i<bs2; i++){
1106: #if defined(PETSC_USE_COMPLEX)
1107: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1108: #else
1109: sum_diag += (*v)*(*v); v++;
1110: #endif
1111: }
1112: jmin++;
1113: }
1114: for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */
1115: for (i=0; i<bs2; i++){
1116: #if defined(PETSC_USE_COMPLEX)
1117: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1118: #else
1119: sum_off += (*v)*(*v); v++;
1120: #endif
1121: }
1122: }
1123: }
1124: *norm = sqrt(sum_diag + 2*sum_off);
1126: } else if (type == NORM_INFINITY) { /* maximum row sum */
1127: PetscMalloc(mbs*sizeof(int),&il);
1128: PetscMalloc(mbs*sizeof(int),&jl);
1129: PetscMalloc(bs*sizeof(PetscReal),&sum);
1130: for (i=0; i<mbs; i++) {
1131: jl[i] = mbs; il[0] = 0;
1132: }
1134: *norm = 0.0;
1135: for (k=0; k<mbs; k++) { /* k_th block row */
1136: for (j=0; j<bs; j++) sum[j]=0.0;
1138: /*-- col sum --*/
1139: i = jl[k]; /* first |A(i,k)| to be added */
1140: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1141: at step k */
1142: while (i<mbs){
1143: nexti = jl[i]; /* next block row to be added */
1144: ik = il[i]; /* block index of A(i,k) in the array a */
1145: for (j=0; j<bs; j++){
1146: v = a->a + ik*bs2 + j*bs;
1147: for (k1=0; k1<bs; k1++) {
1148: sum[j] += PetscAbsScalar(*v); v++;
1149: }
1150: }
1151: /* update il, jl */
1152: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1153: jmax = a->i[i+1];
1154: if (jmin < jmax){
1155: il[i] = jmin;
1156: j = a->j[jmin];
1157: jl[i] = jl[j]; jl[j]=i;
1158: }
1159: i = nexti;
1160: }
1161:
1162: /*-- row sum --*/
1163: jmin = a->i[k]; jmax = a->i[k+1];
1164: for (i=jmin; i<jmax; i++) {
1165: for (j=0; j<bs; j++){
1166: v = a->a + i*bs2 + j;
1167: for (k1=0; k1<bs; k1++){
1168: sum[j] += PetscAbsScalar(*v);
1169: v += bs;
1170: }
1171: }
1172: }
1173: /* add k_th block row to il, jl */
1174: col = aj+jmin;
1175: if (*col == k) jmin++;
1176: if (jmin < jmax){
1177: il[k] = jmin;
1178: j = a->j[jmin];
1179: jl[k] = jl[j]; jl[j] = k;
1180: }
1181: for (j=0; j<bs; j++){
1182: if (sum[j] > *norm) *norm = sum[j];
1183: }
1184: }
1185: PetscFree(il);
1186: PetscFree(jl);
1187: PetscFree(sum);
1188: } else {
1189: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1190: }
1191: return(0);
1192: }
1194: int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1195: {
1196: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1197: int ierr;
1198: PetscTruth flag;
1201: PetscTypeCompare((PetscObject)A,MATSEQBAIJ,&flag);
1202: if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1204: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1205: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1206: *flg = PETSC_FALSE;
1207: return(0);
1208: }
1209:
1210: /* if the a->i are the same */
1211: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1212: if (*flg == PETSC_FALSE) {
1213: return(0);
1214: }
1215:
1216: /* if a->j are the same */
1217: PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);
1218: if (*flg == PETSC_FALSE) {
1219: return(0);
1220: }
1221: /* if a->a are the same */
1222: PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1223: return(0);
1224:
1225: }
1227: int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1228: {
1229: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1230: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1231: PetscScalar *x,zero = 0.0;
1232: MatScalar *aa,*aa_j;
1235: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1236: bs = a->bs;
1237: aa = a->a;
1238: ai = a->i;
1239: aj = a->j;
1240: ambs = a->mbs;
1241: bs2 = a->bs2;
1243: VecSet(&zero,v);
1244: VecGetArray(v,&x);
1245: VecGetLocalSize(v,&n);
1246: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1247: for (i=0; i<ambs; i++) {
1248: j=ai[i];
1249: if (aj[j] == i) { /* if this is a diagonal element */
1250: row = i*bs;
1251: aa_j = aa + j*bs2;
1252: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1253: }
1254: }
1255: VecRestoreArray(v,&x);
1256: return(0);
1257: }
1259: int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1260: {
1261: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1262: PetscScalar *l,*r,x,*li,*ri;
1263: MatScalar *aa,*v;
1264: int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1267: ai = a->i;
1268: aj = a->j;
1269: aa = a->a;
1270: m = A->m;
1271: bs = a->bs;
1272: mbs = a->mbs;
1273: bs2 = a->bs2;
1275: if (ll != rr) {
1276: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be samen");
1277: }
1278: if (ll) {
1279: VecGetArray(ll,&l);
1280: VecGetLocalSize(ll,&lm);
1281: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1282: for (i=0; i<mbs; i++) { /* for each block row */
1283: M = ai[i+1] - ai[i];
1284: li = l + i*bs;
1285: v = aa + bs2*ai[i];
1286: for (j=0; j<M; j++) { /* for each block */
1287: for (k=0; k<bs2; k++) {
1288: (*v++) *= li[k%bs];
1289: }
1290: #ifdef CONT
1291: /* will be used to replace the above loop */
1292: ri = l + bs*aj[ai[i]+j];
1293: for (k=0; k<bs; k++) { /* column value */
1294: x = ri[k];
1295: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1296: }
1297: #endif
1299: }
1300: }
1301: VecRestoreArray(ll,&l);
1302: PetscLogFlops(2*a->s_nz);
1303: }
1304: /* will be deleted */
1305: if (rr) {
1306: VecGetArray(rr,&r);
1307: VecGetLocalSize(rr,&rn);
1308: if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1309: for (i=0; i<mbs; i++) { /* for each block row */
1310: M = ai[i+1] - ai[i];
1311: v = aa + bs2*ai[i];
1312: for (j=0; j<M; j++) { /* for each block */
1313: ri = r + bs*aj[ai[i]+j];
1314: for (k=0; k<bs; k++) {
1315: x = ri[k];
1316: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1317: }
1318: }
1319: }
1320: VecRestoreArray(rr,&r);
1321: PetscLogFlops(a->s_nz);
1322: }
1323: return(0);
1324: }
1326: int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1327: {
1328: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1331: info->rows_global = (double)A->m;
1332: info->columns_global = (double)A->m;
1333: info->rows_local = (double)A->m;
1334: info->columns_local = (double)A->m;
1335: info->block_size = a->bs2;
1336: info->nz_allocated = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1337: info->nz_used = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1338: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1339: info->assemblies = A->num_ass;
1340: info->mallocs = a->reallocs;
1341: info->memory = A->mem;
1342: if (A->factor) {
1343: info->fill_ratio_given = A->info.fill_ratio_given;
1344: info->fill_ratio_needed = A->info.fill_ratio_needed;
1345: info->factor_mallocs = A->info.factor_mallocs;
1346: } else {
1347: info->fill_ratio_given = 0;
1348: info->fill_ratio_needed = 0;
1349: info->factor_mallocs = 0;
1350: }
1351: return(0);
1352: }
1355: int MatZeroEntries_SeqSBAIJ(Mat A)
1356: {
1357: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1358: int ierr;
1361: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1362: return(0);
1363: }
1365: int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1366: {
1367: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1368: int ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1369: PetscReal atmp;
1370: MatScalar *aa;
1371: PetscScalar zero = 0.0,*x;
1372: int ncols,brow,bcol,krow,kcol;
1375: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1376: bs = a->bs;
1377: aa = a->a;
1378: ai = a->i;
1379: aj = a->j;
1380: mbs = a->mbs;
1382: VecSet(&zero,v);
1383: VecGetArray(v,&x);
1384: VecGetLocalSize(v,&n);
1385: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1386: for (i=0; i<mbs; i++) {
1387: ncols = ai[1] - ai[0]; ai++;
1388: brow = bs*i;
1389: for (j=0; j<ncols; j++){
1390: bcol = bs*(*aj);
1391: for (kcol=0; kcol<bs; kcol++){
1392: col = bcol + kcol; /* col index */
1393: for (krow=0; krow<bs; krow++){
1394: atmp = PetscAbsScalar(*aa); aa++;
1395: row = brow + krow; /* row index */
1396: /* printf("val[%d,%d]: %gn",row,col,atmp); */
1397: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1398: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1399: }
1400: }
1401: aj++;
1402: }
1403: }
1404: VecRestoreArray(v,&x);
1405: return(0);
1406: }