00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
00026 #define EIGEN_PACKET_MATH_ALTIVEC_H
00027
00028 namespace internal {
00029
// Architecture tuning knobs. Each macro is defined here only when nothing
// earlier in the translation unit has already supplied a value.

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#endif

// The expression is parenthesized so that it keeps its value when the macro is
// used inside a larger expression (e.g. as the right operand of '/' or '%').
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (8*256*256)
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
00046
00047 typedef __vector float Packet4f;
00048 typedef __vector int Packet4i;
00049 typedef __vector unsigned int Packet4ui;
00050 typedef __vector __bool int Packet4bi;
00051 typedef __vector short int Packet8i;
00052 typedef __vector unsigned char Packet16uc;
00053
00054
00055
00056
// Helpers that declare packet constants named p4f_<NAME> / p4i_<NAME>.

// "FAST" variants build the constant with vec_splat_s32, so X must be an
// immediate accepted by that intrinsic. For the Packet4f flavour the integer
// lanes are reinterpreted as floats through a vector cast (no value conversion).
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)

#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = vec_splat_s32(X)

// General variants: broadcast an arbitrary scalar through pset1.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Broadcasts the integer bit pattern X and reinterprets the lanes as floats.
// Uses the AltiVec vector cast, as the FAST variant above does, instead of a
// NEON-only intrinsic that does not exist on this architecture.
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  Packet4f p4f_##NAME = (Packet4f) pset1<Packet4i>(X)

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  Packet4i p4i_##NAME = pset1<Packet4i>(X)

// Data-stream channel and control-word builder used by prefetch().
// DST_CTRL packs size, count and stride into a single word:
// size in bits 24 and up, count in bits 16 and up, stride in the low bits.
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
00074
00075
00076 static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
00077 static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
00078 static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
00079 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
00080
00081 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
00082 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
00083 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
00084 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
00085 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
00086 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
00087 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
00088
00089 template<> struct packet_traits<float> : default_packet_traits
00090 {
00091 typedef Packet4f type;
00092 enum {
00093 Vectorizable = 1,
00094 AlignedOnScalar = 1,
00095 size=4,
00096
00097
00098 HasSin = 0,
00099 HasCos = 0,
00100 HasLog = 0,
00101 HasExp = 0,
00102 HasSqrt = 0
00103 };
00104 };
00105 template<> struct packet_traits<int> : default_packet_traits
00106 {
00107 typedef Packet4i type;
00108 enum {
00109
00110 Vectorizable = 1,
00111 AlignedOnScalar = 1,
00112 size=4
00113 };
00114 };
00115
00116 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
00117 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
00164
00165 float EIGEN_ALIGN16 af[4];
00166 af[0] = from;
00167 Packet4f vc = vec_ld(0, af);
00168 vc = vec_splat(vc, 0);
00169 return vc;
00170 }
00171
00172 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
00173 int EIGEN_ALIGN16 ai[4];
00174 ai[0] = from;
00175 Packet4i vc = vec_ld(0, ai);
00176 vc = vec_splat(vc, 0);
00177 return vc;
00178 }
00179
00180 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
00181 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
00182
00183 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
00184 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
00185
00186 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
00187 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
00188
00189 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
00190 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
00191
00192 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
00230 {
00231 Packet4f t, y_0, y_1, res;
00232
00233
00234 y_0 = vec_re(b);
00235
00236
00237 t = vec_nmsub(y_0, b, p4f_ONE);
00238 y_1 = vec_madd(y_0, t, y_0);
00239
00240 res = vec_madd(a, y_1, p4f_ZERO);
00241 return res;
00242 }
00243
00244 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& , const Packet4i& )
00245 { eigen_assert(false && "packet integer division are not supported by AltiVec");
00246 return pset1<Packet4i>(0);
00247 }
00248
00249
00250 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
00251 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
00252
00253 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
00254 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
00255
00256 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
00257 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
00258
00259
00260 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
00261 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
00262
00263 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
00264 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
00265
00266 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
00267 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
00268
00269 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
00270 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
00271
00272 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00273 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
00274
00275 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00276 {
00277 EIGEN_DEBUG_ALIGNED_LOAD
00278
00279 Packet16uc MSQ, LSQ;
00280 Packet16uc mask;
00281 MSQ = vec_ld(0, (unsigned char *)from);
00282 LSQ = vec_ld(15, (unsigned char *)from);
00283 mask = vec_lvsl(0, from);
00284 return (Packet4f) vec_perm(MSQ, LSQ, mask);
00285
00286 }
00287 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00288 {
00289 EIGEN_DEBUG_ALIGNED_LOAD
00290
00291 Packet16uc MSQ, LSQ;
00292 Packet16uc mask;
00293 MSQ = vec_ld(0, (unsigned char *)from);
00294 LSQ = vec_ld(15, (unsigned char *)from);
00295 mask = vec_lvsl(0, from);
00296 return (Packet4i) vec_perm(MSQ, LSQ, mask);
00297 }
00298
00299 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00300 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
00301
00302 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
00303 {
00304 EIGEN_DEBUG_UNALIGNED_STORE
00305
00306
00307 Packet16uc MSQ, LSQ, edges;
00308 Packet16uc edgeAlign, align;
00309
00310 MSQ = vec_ld(0, (unsigned char *)to);
00311 LSQ = vec_ld(15, (unsigned char *)to);
00312 edgeAlign = vec_lvsl(0, to);
00313 edges=vec_perm(LSQ,MSQ,edgeAlign);
00314 align = vec_lvsr( 0, to );
00315 MSQ = vec_perm(edges,(Packet16uc)from,align);
00316 LSQ = vec_perm((Packet16uc)from,edges,align);
00317 vec_st( LSQ, 15, (unsigned char *)to );
00318 vec_st( MSQ, 0, (unsigned char *)to );
00319 }
00320 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
00321 {
00322 EIGEN_DEBUG_UNALIGNED_STORE
00323
00324
00325 Packet16uc MSQ, LSQ, edges;
00326 Packet16uc edgeAlign, align;
00327
00328 MSQ = vec_ld(0, (unsigned char *)to);
00329 LSQ = vec_ld(15, (unsigned char *)to);
00330 edgeAlign = vec_lvsl(0, to);
00331 edges=vec_perm(LSQ, MSQ, edgeAlign);
00332 align = vec_lvsr( 0, to );
00333 MSQ = vec_perm(edges, (Packet16uc) from, align);
00334 LSQ = vec_perm((Packet16uc) from, edges, align);
00335 vec_st( LSQ, 15, (unsigned char *)to );
00336 vec_st( MSQ, 0, (unsigned char *)to );
00337 }
00338
00339 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00340 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
00341
00342 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00343 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
00344
00345 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00346 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
00347
00348 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
00349 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
00350
00351 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00352 {
00353 Packet4f b, sum;
00354 b = (Packet4f) vec_sld(a, a, 8);
00355 sum = vec_add(a, b);
00356 b = (Packet4f) vec_sld(sum, sum, 4);
00357 sum = vec_add(sum, b);
00358 return pfirst(sum);
00359 }
00360
00361 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00362 {
00363 Packet4f v[4], sum[4];
00364
00365
00366
00367
00368 v[0] = vec_mergeh(vecs[0], vecs[2]);
00369 v[1] = vec_mergel(vecs[0], vecs[2]);
00370 v[2] = vec_mergeh(vecs[1], vecs[3]);
00371 v[3] = vec_mergel(vecs[1], vecs[3]);
00372
00373 sum[0] = vec_mergeh(v[0], v[2]);
00374 sum[1] = vec_mergel(v[0], v[2]);
00375 sum[2] = vec_mergeh(v[1], v[3]);
00376 sum[3] = vec_mergel(v[1], v[3]);
00377
00378
00379
00380 sum[0] = vec_add(sum[0], sum[1]);
00381
00382 sum[1] = vec_add(sum[2], sum[3]);
00383
00384 sum[0] = vec_add(sum[0], sum[1]);
00385
00386 return sum[0];
00387 }
00388
00389 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00390 {
00391 Packet4i sum;
00392 sum = vec_sums(a, p4i_ZERO);
00393 sum = vec_sld(sum, p4i_ZERO, 12);
00394 return pfirst(sum);
00395 }
00396
00397 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00398 {
00399 Packet4i v[4], sum[4];
00400
00401
00402
00403
00404 v[0] = vec_mergeh(vecs[0], vecs[2]);
00405 v[1] = vec_mergel(vecs[0], vecs[2]);
00406 v[2] = vec_mergeh(vecs[1], vecs[3]);
00407 v[3] = vec_mergel(vecs[1], vecs[3]);
00408
00409 sum[0] = vec_mergeh(v[0], v[2]);
00410 sum[1] = vec_mergel(v[0], v[2]);
00411 sum[2] = vec_mergeh(v[1], v[3]);
00412 sum[3] = vec_mergel(v[1], v[3]);
00413
00414
00415
00416 sum[0] = vec_add(sum[0], sum[1]);
00417
00418 sum[1] = vec_add(sum[2], sum[3]);
00419
00420 sum[0] = vec_add(sum[0], sum[1]);
00421
00422 return sum[0];
00423 }
00424
00425
00426
00427 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00428 {
00429 Packet4f prod;
00430 prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
00431 return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
00432 }
00433
00434 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00435 {
00436 EIGEN_ALIGN16 int aux[4];
00437 pstore(aux, a);
00438 return aux[0] * aux[1] * aux[2] * aux[3];
00439 }
00440
00441
00442 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00443 {
00444 Packet4f b, res;
00445 b = vec_min(a, vec_sld(a, a, 8));
00446 res = vec_min(b, vec_sld(b, b, 4));
00447 return pfirst(res);
00448 }
00449
00450 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00451 {
00452 Packet4i b, res;
00453 b = vec_min(a, vec_sld(a, a, 8));
00454 res = vec_min(b, vec_sld(b, b, 4));
00455 return pfirst(res);
00456 }
00457
00458
00459 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00460 {
00461 Packet4f b, res;
00462 b = vec_max(a, vec_sld(a, a, 8));
00463 res = vec_max(b, vec_sld(b, b, 4));
00464 return pfirst(res);
00465 }
00466
00467 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00468 {
00469 Packet4i b, res;
00470 b = vec_max(a, vec_sld(a, a, 8));
00471 res = vec_max(b, vec_sld(b, b, 4));
00472 return pfirst(res);
00473 }
00474
00475 template<int Offset>
00476 struct palign_impl<Offset,Packet4f>
00477 {
00478 EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
00479 {
00480 if (Offset!=0)
00481 first = vec_sld(first, second, Offset*4);
00482 }
00483 };
00484
00485 template<int Offset>
00486 struct palign_impl<Offset,Packet4i>
00487 {
00488 EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
00489 {
00490 if (Offset!=0)
00491 first = vec_sld(first, second, Offset*4);
00492 }
00493 };
00494
00495 }
00496
00497 #endif // EIGEN_PACKET_MATH_ALTIVEC_H