// NEON/PacketMath.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
// Heavily based on Gael's SSE version.
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

27 #ifndef EIGEN_PACKET_MATH_NEON_H
28 #define EIGEN_PACKET_MATH_NEON_H
29 
30 namespace Eigen {
31 
32 namespace internal {
33 
34 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
35 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
36 #endif
37 
38 // FIXME NEON has 16 quad registers, but since the current register allocator
39 // is so bad, it is much better to reduce it to 8
40 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
41 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
42 #endif
43 
// One SIMD packet holds four 32-bit lanes on NEON.
typedef float32x4_t Packet4f;
typedef int32x4_t   Packet4i;
typedef uint32x4_t  Packet4ui;

// Declare a named float packet constant broadcast from a scalar.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declare a named float packet constant from a 32-bit integer *bit pattern*.
// NOTE(fix): the previous expansion was vreinterpretq_f32_u32(pset1<int>(X)),
// but pset1<int> is the generic scalar overload and returns a plain int,
// which does not convert to uint32x4_t; broadcast with vdupq_n_u32 instead.
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(vdupq_n_u32(X))

// Declare a named int packet constant broadcast from a scalar.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
56 
#if defined(__llvm__) && !defined(__clang__)
  // Special treatment for Apple's llvm-gcc: its NEON packet types are unions,
  // so aggregate initialization needs an extra pair of braces.
  #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
#else
  // Default initializer for packets
  #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y}
  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
#endif

// Data-cache prefetch hint via the ARM "pld" instruction; only defined when
// the toolchain does not already provide a __pld intrinsic/macro.
#ifndef __pld
#define __pld(x) asm volatile ( " pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
#endif
70 
// SIMD capabilities of float on NEON: 4 lanes per packet, vectorized
// division (via reciprocal estimate, see pdiv below), no vectorized
// transcendentals.
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasDiv = 1,
    // FIXME check the Has*
    HasSin = 0,
    HasCos = 0,
    HasLog = 0,
    HasExp = 0,
    HasSqrt = 0
  };
};

// SIMD capabilities of int on NEON: 4 lanes per packet; note there is no
// HasDiv -- integer vector division is unsupported (see pdiv<Packet4i>).
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4
    // FIXME check the Has*
  };
};
98 
#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
// workaround gcc 4.2, 4.3 and 4.4 compilation issue: their arm_neon.h takes
// float32_t* rather than float*, so provide thin forwarding overloads that
// accept plain float pointers.
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
106 
// Reverse mapping: packet type -> scalar type and lane count.
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };

// Broadcast a single scalar to all four lanes of a packet.
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); }
112 
// plset: build the linearly increasing packet {a, a+1, a+2, a+3}.
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
{
  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
{
  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
  return vaddq_s32(pset1<Packet4i>(a), countdown);
}
123 
// Lane-wise addition.
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }

// Lane-wise subtraction.
template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }

// Lane-wise negation.
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }

// Lane-wise multiplication.
template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
135 
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  // NEON (ARMv7) has no vector divide instruction, so compute a * (1/b)
  // with a reciprocal estimate refined by Newton-Raphson. Unlike other SIMD
  // engines (AltiVec/SSE), NEON provides both the estimate (vrecpeq_f32)
  // and the refinement step (vrecpsq_f32, which returns 2 - b*inv).
  Packet4f inv, restep, div;

  inv = vrecpeq_f32(b);

  // First Newton-Raphson iteration: inv <- inv * (2 - b*inv).
  restep = vrecpsq_f32(b, inv);
  inv = vmulq_f32(restep, inv);

  // Second iteration: vrecpe alone yields only ~8 bits of precision and one
  // step ~16 bits; a second step is needed to approach full single precision.
  restep = vrecpsq_f32(b, inv);
  inv = vmulq_f32(restep, inv);

  // Finally, multiply a by 1/b and get the wanted result of the division.
  div = vmulq_f32(a, inv);

  return div;
}
// Integer vector division does not exist on NEON; reaching this
// specialization is a programming error, so assert and return zeros.
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{
  eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
}
161 
// For some weird reasons, pmadd has to be overloaded for packets of integers.
// Fused multiply-add a*b + c, mapped onto NEON's vmla (accumulator first).
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }

// Lane-wise minimum.
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }

// Lane-wise maximum.
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
171 
// Logical operations are not supported for float, so we have to go through
// reinterpret casts (bit-preserving, zero-cost) to the u32 view.
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }

// pandnot(a,b) = a & ~b, i.e. NEON's "bit clear" (vbic).
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
196 
// Aligned loads (vld1 tolerates unaligned addresses on NEON, so the aligned
// and unaligned variants map to the same intrinsic).
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

// Unaligned loads.
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }

// Load two scalars and duplicate each: {from[0], from[0], from[1], from[1]}.
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  float32x2_t lo, hi;
  lo = vdup_n_f32(*from);
  hi = vdup_n_f32(*(from+1));
  return vcombine_f32(lo, hi);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  int32x2_t lo, hi;
  lo = vdup_n_s32(*from);
  hi = vdup_n_s32(*(from+1));
  return vcombine_s32(lo, hi);
}

// Aligned stores.
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

// Unaligned stores.
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

// Data-cache prefetch hints.
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { __pld(addr); }
226 
// Extract the first lane directly with vget_lane instead of spilling all
// four lanes to an aligned stack buffer (the old code's FIXME asked for this).
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a, 0); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a, 0); }

// preverse: {a3,a2,a1,a0}. vrev64q reverses within each 64-bit half giving
// {a1,a0,a3,a2}; swapping the halves completes the full reversal.
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  float32x2_t a_lo, a_hi;
  Packet4f a_r64;

  a_r64 = vrev64q_f32(a);
  a_lo = vget_low_f32(a_r64);
  a_hi = vget_high_f32(a_r64);
  return vcombine_f32(a_hi, a_lo);
}
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  int32x2_t a_lo, a_hi;
  Packet4i a_r64;

  a_r64 = vrev64q_s32(a);
  a_lo = vget_low_s32(a_r64);
  a_hi = vget_high_s32(a_r64);
  return vcombine_s32(a_hi, a_lo);
}

// Lane-wise absolute value. NOTE: as with scalar abs in two's complement,
// vabsq_s32(INT_MIN) == INT_MIN.
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
251 
// Horizontal sum of the four lanes. Two pairwise adds reduce
// {a0,a1,a2,a3} -> {a0+a1, a2+a3} -> {sum, sum}; read lane 0 directly
// instead of bouncing through a stack buffer as before.
template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, sum;

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  sum = vpadd_f32(a_lo, a_hi);
  sum = vpadd_f32(sum, sum);
  return vget_lane_f32(sum, 0);
}
265 
// Reduce four packets at once: returns {sum(vecs[0]), sum(vecs[1]),
// sum(vecs[2]), sum(vecs[3])}.
// NOTE(review): the signature line was lost in the extracted listing and has
// been restored to match the Packet4i version below.
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  float32x4x2_t vtrn1, vtrn2, res1, res2;
  Packet4f sum1, sum2, sum;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);

  // Do the addition of the resulting vectors
  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
  sum = vaddq_f32(sum1, sum2);

  return sum;
}
285 
// Horizontal sum of the four integer lanes; same pairwise-add scheme as the
// float version, reading lane 0 directly instead of storing to the stack.
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, sum;

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  sum = vpadd_s32(a_lo, a_hi);
  sum = vpadd_s32(sum, sum);
  return vget_lane_s32(sum, 0);
}
299 
// Reduce four integer packets at once: returns {sum(vecs[0]), sum(vecs[1]),
// sum(vecs[2]), sum(vecs[3])}.
// NOTE(review): the signature line was lost in the extracted listing and has
// been restored to mirror the Packet4f version above.
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  int32x4x2_t vtrn1, vtrn2, res1, res2;
  Packet4i sum1, sum2, sum;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);

  // Do the addition of the resulting vectors
  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
  sum = vaddq_s32(sum1, sum2);

  return sum;
}
319 
// Other reduction functions:
// mul -- horizontal product of the four lanes. Lane 0 is read directly
// instead of storing the vector to the stack as before.
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, prod;

  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_f32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_f32(prod, vrev64_f32(prod));
  return vget_lane_f32(prod, 0);
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, prod;

  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
  prod = vmul_s32(a_lo, a_hi);
  // Multiply prod with its swapped value |a2*a4|a1*a3|
  prod = vmul_s32(prod, vrev64_s32(prod));
  return vget_lane_s32(prod, 0);
}
354 
// min -- horizontal minimum via two pairwise mins; lane 0 is read directly
// instead of storing the vector to the stack as before.
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, min;

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  min = vpmin_f32(a_lo, a_hi);
  min = vpmin_f32(min, min);
  return vget_lane_f32(min, 0);
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, min;

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  min = vpmin_s32(a_lo, a_hi);
  min = vpmin_s32(min, min);
  return vget_lane_s32(min, 0);
}
382 
// max -- horizontal maximum via two pairwise maxes; lane 0 is read directly
// instead of storing the vector to the stack as before.
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  float32x2_t a_lo, a_hi, max;

  a_lo = vget_low_f32(a);
  a_hi = vget_high_f32(a);
  max = vpmax_f32(a_lo, a_hi);
  max = vpmax_f32(max, max);
  return vget_lane_f32(max, 0);
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
  int32x2_t a_lo, a_hi, max;

  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  max = vpmax_s32(a_lo, a_hi);
  max = vpmax_s32(max, max);
  return vget_lane_s32(max, 0);
}
410 
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
//
// palign_impl<Offset,Type>::run(first, second) shifts the concatenation
// [first|second] left by Offset lanes into `first` (NEON vext needs a
// compile-time immediate, which the template Offset provides).
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
  EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
  {\
    if (Offset!=0)\
      first = Command(first, second, Offset);\
  }\
};

PALIGN_NEON(0,Packet4f,vextq_f32)
PALIGN_NEON(1,Packet4f,vextq_f32)
PALIGN_NEON(2,Packet4f,vextq_f32)
PALIGN_NEON(3,Packet4f,vextq_f32)
PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)

#undef PALIGN_NEON
434 
435 } // end namespace internal
436 
437 } // end namespace Eigen
438 
439 #endif // EIGEN_PACKET_MATH_NEON_H