/* Actual source code: axpy.h */

/* 
   These are macros for daxpy-like operations.  The format is
   APXY(U,Alpha,P,n)
   for
   U += Alpha * P

   In addition, versions that process 2, 3 and 4 vectors are provided;
   these can give significantly better use of memory resources than
   successive calls to the regular daxpy.
 */

 13: #ifndef APXY

#include "petscblaslapack.h"

 17: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 18: #define fortrancopy_ FORTRANCOPY
 19: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 20: #define fortrancopy_ fortrancopy
 21: #endif

 26: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 27: #define fortranzero_ FORTRANZERO
 28: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 29: #define fortranzero_ fortranzero
 30: #endif


 36: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
 37: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 38: #define fortranaypx_ FORTRANAYPX
 39: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 40: #define fortranaypx_ fortranaypx
 41: #endif
 45: #endif

 47: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
 48: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 49: #define fortranwaxpy_ FORTRANWAXPY
 50: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 51: #define fortranwaxpy_ fortranwaxpy
 52: #endif
 56: #endif

 58: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)

 60: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 61: #define fortranmaxpy4_ FORTRANMAXPY4
 62: #define fortranmaxpy3_ FORTRANMAXPY3
 63: #define fortranmaxpy2_ FORTRANMAXPY2
 64: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 65: #define fortranmaxpy4_ fortranmaxpy4
 66: #define fortranmaxpy3_ fortranmaxpy3
 67: #define fortranmaxpy2_ fortranmaxpy2
 68: #endif

 71: EXTERN void fortranmaxpy4_(void*,void*,void*,void*,void*,void*,void*,void*,void*,PetscInt*);
 72: EXTERN void fortranmaxpy3_(void*,void*,void*,void*,void*,void*,void*,PetscInt*);
 73: EXTERN void fortranmaxpy2_(void*,void*,void*,void*,void*,PetscInt*);

 76: #define APXY(U,a1,p1,n)  {PetscBLASInt one=1;\
 77:   BLASaxpy_(&n,&a1,p1,&one,U,&one);}
 78: #define APXY2(U,a1,a2,p1,p2,n) { \
 79:   fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);}
 80: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
 81:   fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);}
 82: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){ \
 83:   fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);}

 85: #elif defined(PETSC_USE_UNROLL_KERNELS)

 87: #define APXY(U,Alpha,P,n) {\
 88:   switch (n & 0x3) {\
 89:   case 3: *U++    += Alpha * *P++;\
 90:   case 2: *U++    += Alpha * *P++;\
 91:   case 1: *U++    += Alpha * *P++;\
 92:   n -= 4;case 0: break;}while (n>0) {U[0] += Alpha * P[0];U[1] += Alpha * P[1];\
 93:                                      U[2] += Alpha * P[2]; U[3] += Alpha * P[3]; \
 94:                                      U += 4; P += 4; n -= 4;}}
 95: #define APXY2(U,a1,a2,p1,p2,n) {\
 96:   switch (n & 0x3) {\
 97:   case 3: *U++    += a1 * *p1++ + a2 * *p2++;\
 98:   case 2: *U++    += a1 * *p1++ + a2 * *p2++;\
 99:   case 1: *U++    += a1 * *p1++ + a2 * *p2++;\
100:   n -= 4;case 0: break;}\
101:   while (n>0) {U[0]+=a1*p1[0]+a2*p2[0];U[1]+=a1*p1[1]+a2*p2[1];\
102:                U[2]+=a1*p1[2]+a2*p2[2];U[3]+=a1*p1[3]+a2*p2[3];U+=4;p1+=4;p2+=4;n -= 4;}}
103: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) {\
104:   switch (n & 0x3) {\
105:   case 3: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
106:   case 2: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
107:   case 1: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
108:   n -= 4;case 0:break;}while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0];\
109:   U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1];\
110:   U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2];\
111:   U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3];U+=4;p1+=4;p2+=4;p3+=4;n-=4;}}
112: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
113:   switch (n & 0x3) {\
114:   case 3: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
115:   case 2: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
116:   case 1: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
117:   n -= 4;case 0:break;}while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0]+a4*p4[0];\
118:   U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1]+a4*p4[1];\
119:   U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2]+a4*p4[2];\
120:   U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3]+a4*p4[3];U+=4;p1+=4;p2+=4;p3+=4;p4+=4;n-=4;}}

122: #elif defined(PETSC_USE_WHILE_KERNELS)

/*
   while-loop APXY kernels (PETSC_USE_WHILE_KERNELS):
     APXY  : U += a1*p1
     APXYk : U += a1*p1 + ... + ak*pk      (k = 2,3,4)

   Side effects: U and p1..p4 are advanced by n elements and n is counted
   down past zero by the post-decrement in the loop test, so all of them must
   be modifiable lvalues and n must not be negative on entry.
*/
#define APXY(U,a1,p1,n)  {\
  while ((n)--) *(U)++ += (a1) * *(p1)++;}
#define APXY2(U,a1,a2,p1,p2,n)  {\
  while ((n)--) *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++;}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) {\
  while ((n)--) *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++;}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
  while ((n)--) *(U)++ += (a1) * *(p1)++ + (a2) * *(p2)++ + (a3) * *(p3)++ + (a4) * *(p4)++;}

133: #elif defined(PETSC_USE_BLAS_KERNELS)

/*
   BLAS-based APXY kernels (PETSC_USE_BLAS_KERNELS).

   APXY    : U += a1*p1 through a single BLASaxpy_ call, stride 1 for both
             vectors.
   APXY2-4 : built from successive APXY calls, one per input vector.

   a1..a4 and n are passed by address and must be lvalues.
   NOTE(review): BLASaxpy_ receives &n directly, so n presumably has to be of
   the BLAS integer type - confirm against petscblaslapack.h.
*/
/* The stride variable is called __one so a caller argument named `one` is not captured. */
#define APXY(U,a1,p1,n)  {PetscBLASInt __one=1;\
  BLASaxpy_(&(n),&(a1),(p1),&__one,(U),&__one);}
#define APXY2(U,a1,a2,p1,p2,n){APXY(U,a1,p1,n);\
  APXY(U,a2,p2,n);}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){APXY2(U,a1,a2,p1,p2,n);\
  APXY(U,a3,p3,n);}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){APXY2(U,a1,a2,p1,p2,n);\
  APXY2(U,a3,a4,p3,p4,n);}

144: #elif defined(PETSC_USE_FOR_KERNELS)

/*
   for-loop APXY kernels (PETSC_USE_FOR_KERNELS):
     APXY  : U += a1*p1, two elements per iteration through the temporaries
             __s1/__s2, with a trailing single update when n is odd
     APXYk : U[i] += a1*p1[i] + ... + ak*pk[i]      (k = 2,3,4)

   The vectors are indexed, not advanced, and n is not modified.
*/
#define APXY(U,a1,p1,n)  {PetscInt __i;PetscScalar __s1,__s2; \
  for(__i=0;__i<(n)-1;__i+=2){__s1=(a1)*(p1)[__i];__s2=(a1)*(p1)[__i+1];\
  __s1+=(U)[__i];__s2+=(U)[__i+1];(U)[__i]=__s1;(U)[__i+1]=__s2;}\
  if ((n) & 0x1) (U)[__i] += (a1) * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)(U)[__i] += (a1) * (p1)[__i] + (a2) * (p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)(U)[__i]+=(a1)*(p1)[__i]+(a2)*(p2)[__i]+(a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)(U)[__i]+=(a1)*(p1)[__i]+(a2)*(p2)[__i]+(a3)*(p3)[__i]+(a4)*(p4)[__i];}

157: #else

/*
   Default APXY kernels (no PETSC_USE_*_KERNELS selection):
     APXY  : U[i] += a1*p1[i]; a1 is copied into the local _a1 once, so the
             a1 argument is evaluated a single time
     APXYk : U[i] += a1*p1[i] + ... + ak*pk[i]      (k = 2,3,4)

   The vectors are indexed, not advanced, and n is not modified.
*/
#define APXY(U,a1,p1,n)  {PetscInt __i;PetscScalar _a1=(a1);\
  for(__i=0;__i<(n);__i++)(U)[__i]+=_a1 * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)(U)[__i] += (a1) * (p1)[__i] + (a2) * (p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)(U)[__i]+=(a1)*(p1)[__i]+(a2)*(p2)[__i]+(a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)(U)[__i]+=(a1)*(p1)[__i]+(a2)*(p2)[__i]+(a3)*(p3)[__i]+(a4)*(p4)[__i];}

168: #endif


/* ----------------------------------------------------------------------------
      axpy() with a stride: both U and P advance by inc between elements
   ---------------------------------------------------------------------------*/
174: #ifdef PETSC_USE_UNROLL_KERNELS
/*
   Strided, two-way unrolled kernels (PETSC_USE_UNROLL_KERNELS):
     APXYINC  : U[i*inc] += Alpha*P[i*inc]
     APXYkINC : U[i*inc] += a1*p1[i*inc] + ... + ak*pk[i*inc]   (k = 2,3,4)
   for i in [0,n).  When n is odd one element is handled up front, then the
   loop updates the elements at offsets 0 and inc and steps by 2*inc.

   Side effects: U and the input-vector arguments are advanced by n*inc and n
   is counted down to 0, so they must be modifiable lvalues.
*/
#define APXYINC(U,Alpha,P,n,inc) {\
if ((n) & 0x1) {\
  *(U) += (Alpha) * *(P); (U) += (inc); (P) += (inc); (n)--;}\
while ((n) > 0) {\
  (U)[0] += (Alpha) * (P)[0]; (U)[inc] += (Alpha) * (P)[inc];\
  (U) += 2*(inc); (P) += 2*(inc); (n) -= 2;}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
if ((n) & 0x1) {\
  *(U) += (a1) * *(p1) + (a2) * *(p2);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (n)--;}\
while ((n) > 0) {\
  (U)[0]   += (a1)*(p1)[0]   + (a2)*(p2)[0];\
  (U)[inc] += (a1)*(p1)[inc] + (a2)*(p2)[inc];\
  (U) += 2*(inc); (p1) += 2*(inc); (p2) += 2*(inc); (n) -= 2;}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) {\
if ((n) & 0x1) {\
  *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (n)--;}\
while ((n) > 0) {\
  (U)[0]   += (a1)*(p1)[0]   + (a2)*(p2)[0]   + (a3)*(p3)[0];\
  (U)[inc] += (a1)*(p1)[inc] + (a2)*(p2)[inc] + (a3)*(p3)[inc];\
  (U) += 2*(inc); (p1) += 2*(inc); (p2) += 2*(inc); (p3) += 2*(inc); (n) -= 2;}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) {\
if ((n) & 0x1) {\
  *(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3) + (a4) * *(p4);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (p4) += (inc); (n)--;}\
while ((n) > 0) {\
  (U)[0]   += (a1)*(p1)[0]   + (a2)*(p2)[0]   + (a3)*(p3)[0]   + (a4)*(p4)[0];\
  (U)[inc] += (a1)*(p1)[inc] + (a2)*(p2)[inc] + (a3)*(p3)[inc] + (a4)*(p4)[inc];\
  (U) += 2*(inc); (p1) += 2*(inc); (p2) += 2*(inc); (p3) += 2*(inc); (p4) += 2*(inc); (n) -= 2;}}

200: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   Strided while-loop kernels (PETSC_USE_WHILE_KERNELS):
     APXYINC  : U[i*inc] += a1*p1[i*inc]
     APXYkINC : U[i*inc] += a1*p1[i*inc] + ... + ak*pk[i*inc]   (k = 2,3,4)
   for i in [0,n).

   Side effects: U and p1..p4 are advanced by n*inc and n is counted down past
   zero by the post-decrement in the loop test, so they must be modifiable
   lvalues and n must not be negative on entry.
*/
#define APXYINC(U,a1,p1,n,inc) {\
while ((n)--) {*(U) += (a1) * *(p1); (U) += (inc); (p1) += (inc);}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc)  {\
while ((n)--) {*(U) += (a1) * *(p1) + (a2) * *(p2);\
  (U) += (inc); (p1) += (inc); (p2) += (inc);}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc){\
while ((n)--) {*(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc);}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) {\
while ((n)--) {*(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3) + (a4) * *(p4);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (p4) += (inc);}}

212: #else
/*
   Default strided kernels:
     APXYINC  : U[i*inc] += a1*p1[i*inc]
     APXYkINC : U[i*inc] += a1*p1[i*inc] + ... + ak*pk[i*inc]   (k = 2,3,4)
   for i in [0,n).

   These need to be converted to for loops.  As written they are while loops:
   U and p1..p4 are advanced by n*inc and n is counted down past zero, so all
   of them must be modifiable lvalues and n must not be negative on entry.
*/
#define APXYINC(U,a1,p1,n,inc) {\
while ((n)--) {*(U) += (a1) * *(p1); (U) += (inc); (p1) += (inc);}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
while ((n)--) {*(U) += (a1) * *(p1) + (a2) * *(p2);\
  (U) += (inc); (p1) += (inc); (p2) += (inc);}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) {\
while ((n)--) {*(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc);}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc){\
while ((n)--) {*(U) += (a1) * *(p1) + (a2) * *(p2) + (a3) * *(p3) + (a4) * *(p4);\
  (U) += (inc); (p1) += (inc); (p2) += (inc); (p3) += (inc); (p4) += (inc);}}
224: #endif

/* --------------------------------------------------------------------
   This is aypx:
    for (i=0; i<n; i++) 
       y[i] = x[i] + alpha * y[i];
   In AYPX(U,Alpha,P,n) below, U plays the role of y and P the role of x.
  ---------------------------------------------------------------------*/
231: #if defined(PETSC_USE_UNROLL_KERNELS)
/*
   Hand-unrolled AYPX (PETSC_USE_UNROLL_KERNELS): U = P + Alpha*U.

   The (n & 3) leftover elements are handled in a fall-through switch, then
   the rest four at a time.  After the leftover elements n is lowered by 4
   (not by the leftover count); because the leftover count is below 4 the
   while (n > 0) test still runs once per remaining group of four.

   Side effects: U and P are advanced past the processed elements and n is
   modified (left <= 0 for n >= 0), so they must be modifiable lvalues.
*/
#define AYPX(U,Alpha,P,n) {\
switch ((n) & 0x3) {\
case 3: *(U) = *(P)++ + (Alpha) * *(U); (U)++; /* fall through */\
case 2: *(U) = *(P)++ + (Alpha) * *(U); (U)++; /* fall through */\
case 1: *(U) = *(P)++ + (Alpha) * *(U); (U)++;\
        (n) -= 4;                              /* fall through */\
case 0: break;}\
while ((n) > 0) {\
  (U)[0] = (P)[0] + (Alpha) * (U)[0];\
  (U)[1] = (P)[1] + (Alpha) * (U)[1];\
  (U)[2] = (P)[2] + (Alpha) * (U)[2];\
  (U)[3] = (P)[3] + (Alpha) * (U)[3];\
  (U) += 4; (P) += 4; (n) -= 4;}}

242: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   while-loop AYPX (PETSC_USE_WHILE_KERNELS): U = p1 + a1*U.

   The macro body opens two braces (the macro block and the loop body) and now
   closes both; previously the outer brace was left open, so every use failed
   to compile.

   Side effects: U and p1 are advanced by n elements and n is counted down
   past zero by the post-decrement in the loop test, so they must be
   modifiable lvalues and n must not be negative on entry.
*/
#define AYPX(U,a1,p1,n)  {\
while ((n)--) {*(U) = *(p1)++ + (a1) * *(U); (U)++;}}

246: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   for-loop AYPX (PETSC_USE_FOR_KERNELS): U[i] = p1[i] + a1*U[i].

   Two elements are updated per iteration through the temporaries __s1/__s2;
   when n is odd the last element is handled by the trailing if.  U and p1 are
   indexed, not advanced, and n is not modified.
*/
#define AYPX(U,a1,p1,n)  {PetscInt __i;PetscScalar __s1,__s2; \
for(__i=0;__i<(n)-1;__i+=2){__s1=(p1)[__i];__s2=(p1)[__i+1];\
__s1+=(a1)*(U)[__i];__s2+=(a1)*(U)[__i+1];\
(U)[__i]=__s1;(U)[__i+1]=__s2;}\
if ((n) & 0x1) (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}

253: #else
/*
   Default AYPX: U[i] = p1[i] + a1*U[i] for i in [0,n).
   U and p1 are indexed, not advanced, and n is not modified.
*/
#define AYPX(U,a1,p1,n)  {PetscInt __i;\
for(__i=0;__i<(n);__i++)(U)[__i]=(p1)[__i]+(a1) * (U)[__i];}
256: #endif

/* ----------------------------------------------------------------------------------
       YMX(U,p1,n): U[i] -= p1[i] for i in [0,n).
       Useful for APXY where alpha == -1 (no multiply needed).
       U and p1 are indexed, not advanced, and n is not modified.
  ----------------------------------------------------------------------------------
  */
#define YMX(U,p1,n)  {PetscInt __i;\
for(__i=0;__i<(n);__i++)(U)[__i]-=(p1)[__i];}
/*
   YPX(U,p1,n): U[i] += p1[i] for i in [0,n).
   Useful for APXY where alpha == 1 (no multiply needed).
   U and p1 are indexed, not advanced, and n is not modified.
*/
#define YPX(U,p1,n)  {PetscInt __i;\
for(__i=0;__i<(n);__i++)(U)[__i]+=(p1)[__i];}

268: #endif