Main MRPT website > C++ reference
MRPT logo

PacketMath.h

Go to the documentation of this file.
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
00005 //
00006 // Eigen is free software; you can redistribute it and/or
00007 // modify it under the terms of the GNU Lesser General Public
00008 // License as published by the Free Software Foundation; either
00009 // version 3 of the License, or (at your option) any later version.
00010 //
00011 // Alternatively, you can redistribute it and/or
00012 // modify it under the terms of the GNU General Public License as
00013 // published by the Free Software Foundation; either version 2 of
00014 // the License, or (at your option) any later version.
00015 //
00016 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
00017 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00018 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
00019 // GNU General Public License for more details.
00020 //
00021 // You should have received a copy of the GNU Lesser General Public
00022 // License and a copy of the GNU General Public License along with
00023 // Eigen. If not, see <http://www.gnu.org/licenses/>.
00024 
00025 #ifndef EIGEN_PACKET_MATH_SSE_H
00026 #define EIGEN_PACKET_MATH_SSE_H
00027 
00028 namespace internal {
00029 
00030 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00031 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
00032 #endif
00033 
00034 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
00035 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
00036 #endif
00037 
00038 typedef __m128  Packet4f;
00039 typedef __m128i Packet4i;
00040 typedef __m128d Packet2d;
00041 
00042 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
00043 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
00044 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
00045 
00046 #define vec4f_swizzle1(v,p,q,r,s) \
00047   (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
00048 
00049 #define vec4i_swizzle1(v,p,q,r,s) \
00050   (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
00051 
00052 #define vec2d_swizzle1(v,p,q) \
00053   (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
00054   
00055 #define vec4f_swizzle2(a,b,p,q,r,s) \
00056   (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
00057 
00058 #define vec4i_swizzle2(a,b,p,q,r,s) \
00059   (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
00060 
00061 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
00062   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
00063 
00064 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
00065   const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
00066 
00067 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
00068   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
00069 
00070 
00071 template<> struct packet_traits<float>  : default_packet_traits
00072 {
00073   typedef Packet4f type;
00074   enum {
00075     Vectorizable = 1,
00076     AlignedOnScalar = 1,
00077     size=4,
00078 
00079     HasDiv    = 1,
00080     HasSin  = EIGEN_FAST_MATH,
00081     HasCos  = EIGEN_FAST_MATH,
00082     HasLog  = 1,
00083     HasExp  = 1,
00084     HasSqrt = 1
00085   };
00086 };
00087 template<> struct packet_traits<double> : default_packet_traits
00088 {
00089   typedef Packet2d type;
00090   enum {
00091     Vectorizable = 1,
00092     AlignedOnScalar = 1,
00093     size=2,
00094 
00095     HasDiv    = 1
00096   };
00097 };
00098 template<> struct packet_traits<int>    : default_packet_traits
00099 {
00100   typedef Packet4i type;
00101   enum {
00102     // FIXME check the Has*
00103     Vectorizable = 1,
00104     AlignedOnScalar = 1,
00105     size=4
00106   };
00107 };
00108 
00109 template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
00110 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
00111 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
00112 
00113 #ifdef __GNUC__
00114 // Sometimes GCC implements _mm_set1_p* using multiple moves,
00115 // that is inefficient :( (e.g., see gemm_pack_rhs)
00116 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
00117   Packet4f res = _mm_set_ss(from);
00118   return vec4f_swizzle1(res,0,0,0,0);
00119 }
00120 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
00121   // NOTE the SSE3 intrinsic _mm_loaddup_pd is never faster but sometimes much slower
00122   Packet2d res = _mm_set_sd(from);
00123   return vec2d_swizzle1(res, 0, 0);
00124 }
00125 #else
00126 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
00127 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
00128 #endif
00129 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
00130 
00131 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
00132 template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
00133 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
00134 
00135 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
00136 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
00137 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
00138 
00139 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
00140 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
00141 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
00142 
00143 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
00144 {
00145   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
00146   return _mm_xor_ps(a,mask);
00147 }
00148 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
00149 {
00150   const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
00151   return _mm_xor_pd(a,mask);
00152 }
00153 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
00154 {
00155   return psub(_mm_setr_epi32(0,0,0,0), a);
00156 }
00157 
00158 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
00159 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
00160 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
00161 {
00162 #ifdef EIGEN_VECTORIZE_SSE4_1
00163   return _mm_mullo_epi32(a,b);
00164 #else
00165   // this version is slightly faster than 4 scalar products
00166   return vec4i_swizzle1(
00167             vec4i_swizzle2(
00168               _mm_mul_epu32(a,b),
00169               _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
00170                             vec4i_swizzle1(b,1,0,3,2)),
00171               0,2,0,2),
00172             0,2,1,3);
00173 #endif
00174 }
00175 
00176 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
00177 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
00178 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
00179 { eigen_assert(false && "packet integer division are not supported by SSE");
00180   return pset1<Packet4i>(0);
00181 }
00182 
00183 // for some weird raisons, it has to be overloaded for packet of integers
00184 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
00185 
00186 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
00187 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
00188 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
00189 {
00190   // after some bench, this version *is* faster than a scalar implementation
00191   Packet4i mask = _mm_cmplt_epi32(a,b);
00192   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00193 }
00194 
00195 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
00196 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
00197 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
00198 {
00199   // after some bench, this version *is* faster than a scalar implementation
00200   Packet4i mask = _mm_cmpgt_epi32(a,b);
00201   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00202 }
00203 
00204 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
00205 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
00206 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
00207 
00208 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
00209 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
00210 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
00211 
00212 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
00213 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
00214 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
00215 
00216 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
00217 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
00218 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
00219 
00220 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
00221 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
00222 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
00223 
00224 #if defined(_MSC_VER)
00225   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
00226   template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
00227   template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
00228 #else
00229 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
00230 // require pointer casting to incompatible pointer types and leads to invalid code
00231 // because of the strict aliasing rule. The "dummy" stuff are required to enforce
00232 // a correct instruction dependency.
00233 // TODO: do the same for MSVC (ICC is compatible)
00234 // NOTE: with the code below, MSVC's compiler crashes!
00235 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00236 {
00237   EIGEN_DEBUG_UNALIGNED_LOAD
00238   __m128d res;
00239   res =  _mm_load_sd((const double*)(from)) ;
00240   res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
00241   return _mm_castpd_ps(res);
00242 }
00243 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
00244 {
00245   EIGEN_DEBUG_UNALIGNED_LOAD
00246   __m128d res;
00247   res = _mm_load_sd(from) ;
00248   res = _mm_loadh_pd(res,from+1);
00249   return res;
00250 }
00251 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00252 {
00253   EIGEN_DEBUG_UNALIGNED_LOAD
00254   __m128d res;
00255   res =  _mm_load_sd((const double*)(from)) ;
00256   res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
00257   return _mm_castpd_si128(res);
00258 }
00259 #endif
00260 
00261 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
00262 {
00263   return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd((const double*)from)), 0, 0, 1, 1);
00264 }
00265 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
00266 { return pset1<Packet2d>(from[0]); }
00267 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
00268 {
00269   Packet4i tmp;
00270   tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
00271   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
00272 }
00273 
00274 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
00275 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
00276 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
00277 
00278 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
00279   EIGEN_DEBUG_UNALIGNED_STORE
00280   _mm_storel_pd((to), from);
00281   _mm_storeh_pd((to+1), from);
00282 }
00283 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castps_pd(from)); }
00284 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castsi128_pd(from)); }
00285 
00286 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00287 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00288 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00289 
00290 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) && !defined(__INTEL_COMPILER)
00291 // The temporary variable fixes an internal compilation error.
00292 // Direct of the struct members fixed bug #62.
00293 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
00294 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
00295 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
00296 #elif defined(_MSC_VER) && (_MSC_VER <= 1500) && !defined(__INTEL_COMPILER)
00297 // The temporary variable fixes an internal compilation error.
00298 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
00299 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
00300 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
00301 #else
00302 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
00303 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
00304 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
00305 #endif
00306 
00307 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
00308 { return _mm_shuffle_ps(a,a,0x1B); }
00309 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
00310 { return _mm_shuffle_pd(a,a,0x1); }
00311 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
00312 { return _mm_shuffle_epi32(a,0x1B); }
00313 
00314 
00315 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
00316 {
00317   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
00318   return _mm_and_ps(a,mask);
00319 }
00320 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
00321 {
00322   const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
00323   return _mm_and_pd(a,mask);
00324 }
00325 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
00326 {
00327   #ifdef EIGEN_VECTORIZE_SSSE3
00328   return _mm_abs_epi32(a);
00329   #else
00330   Packet4i aux = _mm_srai_epi32(a,31);
00331   return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
00332   #endif
00333 }
00334 
00335 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
00336 {
00337   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
00338   vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
00339   vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
00340   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
00341 }
00342 
00343 #ifdef EIGEN_VECTORIZE_SSE3
00344 // TODO implement SSE2 versions as well as integer versions
00345 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00346 {
00347   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
00348 }
00349 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
00350 {
00351   return _mm_hadd_pd(vecs[0], vecs[1]);
00352 }
00353 // SSSE3 version:
00354 // EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
00355 // {
00356 //   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
00357 // }
00358 
00359 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00360 {
00361   Packet4f tmp0 = _mm_hadd_ps(a,a);
00362   return pfirst(_mm_hadd_ps(tmp0, tmp0));
00363 }
00364 
00365 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
00366 
00367 // SSSE3 version:
00368 // EIGEN_STRONG_INLINE float predux(const Packet4i& a)
00369 // {
00370 //   Packet4i tmp0 = _mm_hadd_epi32(a,a);
00371 //   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
00372 // }
00373 #else
00374 // SSE2 versions
00375 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00376 {
00377   Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
00378   return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00379 }
00380 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
00381 {
00382   return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
00383 }
00384 
00385 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00386 {
00387   Packet4f tmp0, tmp1, tmp2;
00388   tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
00389   tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
00390   tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
00391   tmp0 = _mm_add_ps(tmp0, tmp1);
00392   tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
00393   tmp1 = _mm_add_ps(tmp1, tmp2);
00394   tmp2 = _mm_movehl_ps(tmp1, tmp0);
00395   tmp0 = _mm_movelh_ps(tmp0, tmp1);
00396   return _mm_add_ps(tmp0, tmp2);
00397 }
00398 
00399 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
00400 {
00401   return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
00402 }
00403 #endif  // SSE3
00404 
00405 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00406 {
00407   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
00408   return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
00409 }
00410 
00411 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00412 {
00413   Packet4i tmp0, tmp1, tmp2;
00414   tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
00415   tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
00416   tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
00417   tmp0 = _mm_add_epi32(tmp0, tmp1);
00418   tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
00419   tmp1 = _mm_add_epi32(tmp1, tmp2);
00420   tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
00421   tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
00422   return _mm_add_epi32(tmp0, tmp2);
00423 }
00424 
00425 // Other reduction functions:
00426 
00427 // mul
00428 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00429 {
00430   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
00431   return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00432 }
00433 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
00434 {
00435   return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
00436 }
00437 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00438 {
00439   // after some experiments, it is seems this is the fastest way to implement it
00440   // for GCC (eg., reusing pmul is very slow !)
00441   // TODO try to call _mm_mul_epu32 directly
00442   EIGEN_ALIGN16 int aux[4];
00443   pstore(aux, a);
00444   return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
00445 }
00446 
00447 // min
00448 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00449 {
00450   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
00451   return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00452 }
00453 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
00454 {
00455   return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
00456 }
00457 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00458 {
00459   // after some experiments, it is seems this is the fastest way to implement it
00460   // for GCC (eg., it does not like using std::min after the pstore !!)
00461   EIGEN_ALIGN16 int aux[4];
00462   pstore(aux, a);
00463   register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
00464   register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
00465   return aux0<aux2 ? aux0 : aux2;
00466 }
00467 
00468 // max
00469 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00470 {
00471   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
00472   return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00473 }
00474 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
00475 {
00476   return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
00477 }
00478 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00479 {
00480   // after some experiments, it is seems this is the fastest way to implement it
00481   // for GCC (eg., it does not like using std::min after the pstore !!)
00482   EIGEN_ALIGN16 int aux[4];
00483   pstore(aux, a);
00484   register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
00485   register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
00486   return aux0>aux2 ? aux0 : aux2;
00487 }
00488 
00489 #if (defined __GNUC__)
00490 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
00491 // {
00492 //   Packet4f res = b;
00493 //   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
00494 //   return res;
00495 // }
00496 // EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
00497 // {
00498 //   Packet4i res = a;
00499 //   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
00500 //   return res;
00501 // }
00502 #endif
00503 
00504 #ifdef EIGEN_VECTORIZE_SSSE3
00505 // SSSE3 versions
00506 template<int Offset>
00507 struct palign_impl<Offset,Packet4f>
00508 {
00509   EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
00510   {
00511     if (Offset!=0)
00512       first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
00513   }
00514 };
00515 
00516 template<int Offset>
00517 struct palign_impl<Offset,Packet4i>
00518 {
00519   EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
00520   {
00521     if (Offset!=0)
00522       first = _mm_alignr_epi8(second,first, Offset*4);
00523   }
00524 };
00525 
00526 template<int Offset>
00527 struct palign_impl<Offset,Packet2d>
00528 {
00529   EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)
00530   {
00531     if (Offset==1)
00532       first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
00533   }
00534 };
00535 #else
00536 // SSE2 versions
00537 template<int Offset>
00538 struct palign_impl<Offset,Packet4f>
00539 {
00540   EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
00541   {
00542     if (Offset==1)
00543     {
00544       first = _mm_move_ss(first,second);
00545       first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
00546     }
00547     else if (Offset==2)
00548     {
00549       first = _mm_movehl_ps(first,first);
00550       first = _mm_movelh_ps(first,second);
00551     }
00552     else if (Offset==3)
00553     {
00554       first = _mm_move_ss(first,second);
00555       first = _mm_shuffle_ps(first,second,0x93);
00556     }
00557   }
00558 };
00559 
00560 template<int Offset>
00561 struct palign_impl<Offset,Packet4i>
00562 {
00563   EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
00564   {
00565     if (Offset==1)
00566     {
00567       first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00568       first = _mm_shuffle_epi32(first,0x39);
00569     }
00570     else if (Offset==2)
00571     {
00572       first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
00573       first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00574     }
00575     else if (Offset==3)
00576     {
00577       first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00578       first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
00579     }
00580   }
00581 };
00582 
00583 template<int Offset>
00584 struct palign_impl<Offset,Packet2d>
00585 {
00586   EIGEN_STRONG_INLINE static void run(Packet2d& first, const Packet2d& second)
00587   {
00588     if (Offset==1)
00589     {
00590       first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
00591       first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
00592     }
00593   }
00594 };
00595 #endif
00596 
00597 } // end namespace internal
00598 
00599 #endif // EIGEN_PACKET_MATH_SSE_H



Page generated by Doxygen 1.7.3 for MRPT 0.9.4 SVN:exported at Tue Jan 25 21:56:31 UTC 2011