Eigen  3.3.9
 
Loading...
Searching...
No Matches
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6// Heavily based on Gael's SSE version.
7//
8// This Source Code Form is subject to the terms of the Mozilla
9// Public License v. 2.0. If a copy of the MPL was not distributed
10// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
12#ifndef EIGEN_PACKET_MATH_NEON_H
13#define EIGEN_PACKET_MATH_NEON_H
14
15namespace Eigen {
16
17namespace internal {
18
19#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
21#endif
22
23#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25#endif
26
27#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
28#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
29#endif
30
31#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
32#if EIGEN_ARCH_ARM64
33#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
34#else
35#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
36#endif
37#endif
38
39#if EIGEN_COMP_MSVC
40
41// In MSVC's arm_neon.h header file, all NEON vector types
42// are aliases to the same underlying type __n128.
43// We thus have to wrap them to make them different C++ types.
44// (See also bug 1428)
45
// Wrapper giving each NEON vector type a distinct C++ type under MSVC, where
// arm_neon.h aliases all NEON vector types to the same __n128 (see bug 1428).
// Implicit conversions to/from T let the wrapper be passed straight to the
// NEON intrinsics. unique_id only serves to make each instantiation distinct.
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  eigen_packet_wrapper() {}
  eigen_packet_wrapper(const T &v) : m_val(v) {}
  eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  // The wrapped NEON register value.
  T m_val;
};
// One distinct wrapper type per NEON register type (MSVC only).
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t  ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t  ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
65
66#else
67
68typedef float32x2_t Packet2f;
69typedef float32x4_t Packet4f;
70typedef int32x4_t Packet4i;
71typedef int32x2_t Packet2i;
72typedef uint32x4_t Packet4ui;
73
74#endif // EIGEN_COMP_MSVC
75
// Helpers declaring a local packet constant broadcast from a scalar value.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares a float packet whose per-lane bit pattern is the integer X.
// Bug fix: the previous definition called pset1<int32_t>(X), which yields a
// plain scalar int32_t (via the generic pset1), so vreinterpretq_f32_u32 on
// it would not compile if this macro were ever instantiated. Broadcast into a
// Packet4i first and reinterpret its bits as float.
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_s32(pset1<Packet4i>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
84
// Pick the best available cache-prefetch mechanism for the target/compiler.
#if EIGEN_ARCH_ARM64
  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
  // prefetch instructions there are too detailed for __builtin_prefetch to map
  // meaningfully to them.
  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
  // ARM compiler intrinsic.
  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif EIGEN_ARCH_ARM32
  // Raw PLD instruction on 32-bit ARM when no intrinsic is available.
  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
  // by default no explicit prefetching
  #define EIGEN_ARM_PREFETCH(ADDR)
#endif
100
// Packet traits for float: 4-wide NEON vectorization via Packet4f.
template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half; // Packet2f intrinsics not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket=0, // Packet2f intrinsics not implemented yet

    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 1,
    HasSqrt = 0
  };
};
// Packet traits for int32_t: 4-wide NEON vectorization via Packet4i.
template<> struct packet_traits<int32_t>  : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half; // Packet2i intrinsics not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
    // FIXME check the Has*
  };
};

#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
// workaround gcc 4.2, 4.3 and 4.4 compilation issue: redirect float* loads/stores
// through explicit float32_t* casts.
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif

// Scalar type, size and alignment carried by each packet type.
template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

// Broadcast one scalar to every lane of a packet.
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&   from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
147
148template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
149{
150 const float f[] = {0, 1, 2, 3};
151 Packet4f countdown = vld1q_f32(f);
152 return vaddq_f32(pset1<Packet4f>(a), countdown);
153}
154template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
155{
156 const int32_t i[] = {0, 1, 2, 3};
157 Packet4i countdown = vld1q_s32(i);
158 return vaddq_s32(pset1<Packet4i>(a), countdown);
159}
160
// Lane-wise arithmetic, each mapped directly to one NEON intrinsic.
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }

// Conjugation is the identity for real-valued packets.
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
175
176EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
177 return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
178}
179
180EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
181 return vreinterpretq_f32_u32(vcleq_f32(a, b));
182}
183
184EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
185{
186 // Compute approximate reciprocal.
187 float32x4_t result = vrecpeq_f32(a);
188 result = vmulq_f32(vrecpsq_f32(a, result), result);
189 result = vmulq_f32(vrecpsq_f32(a, result), result);
190 return result;
191}
192
#if EIGEN_ARCH_ARM64
// AArch64 has a native vector divide instruction.
template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
#else
// 32-bit ARM has no vector divide: emulate via Newton-Raphson reciprocal.
template<typename Packet>
EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
  // avoid underflow with the following manipulation:
  //   a / b = f * (a * reciprocal(f * b))
  const Packet cst_one = pset1<Packet>(1.0f);
  const Packet cst_quarter = pset1<Packet>(0.25f);
  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);

  // Pre-scale b by 1/4 only in lanes where |b| is large enough to underflow.
  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
  return result;
}

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pdiv_float_common(a, b);
}

#endif

// NEON has no integer vector division; calling this is a usage error.
template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by NEON");
  return pset1<Packet4i>(0);
}

// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
// then implements a slow software scalar fallback calling fmaf()!
// Filed LLVM bug:
//   https://llvm.org/bugs/show_bug.cgi?id=27216
#if (defined EIGEN_VECTORIZE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
// See bug 936.
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
// MLA is not fused i.e. does 2 roundings.
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
  // -march=armv7-a, that is a very common case.
  // See e.g. this thread:
  //   http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
  // Filed LLVM bug:
  //   https://llvm.org/bugs/show_bug.cgi?id=27219
  // Force the VMLA instruction via inline asm so the compiler cannot split it.
  Packet4f r = c;
  asm volatile(
    "vmla.f32 %q[r], %q[a], %q[b]"
    : [r] "+w" (r)
    : [a] "w" (a),
      [b] "w" (b)
    : );
  return r;
#else
  return vmlaq_f32(c,a,b);
#endif
}
#endif

// No FMA instruction for int, so use MLA unconditionally.
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
261
// Lane-wise min/max.
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }

// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }

// pandnot(a,b) computes a AND NOT b (bit-clear, vbic).
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }

// Aligned / unaligned full-packet loads.
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t*  from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
298
299template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
300{
301 float32x2_t lo, hi;
302 lo = vld1_dup_f32(from);
303 hi = vld1_dup_f32(from+1);
304 return vcombine_f32(lo, hi);
305}
306template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
307{
308 int32x2_t lo, hi;
309 lo = vld1_dup_s32(from);
310 hi = vld1_dup_s32(from+1);
311 return vcombine_s32(lo, hi);
312}
313
// Aligned / unaligned full-packet stores.
template<> EIGEN_STRONG_INLINE void pstore<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
319
320template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
321{
322 Packet4f res = pset1<Packet4f>(0.f);
323 res = vsetq_lane_f32(from[0*stride], res, 0);
324 res = vsetq_lane_f32(from[1*stride], res, 1);
325 res = vsetq_lane_f32(from[2*stride], res, 2);
326 res = vsetq_lane_f32(from[3*stride], res, 3);
327 return res;
328}
329template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
330{
331 Packet4i res = pset1<Packet4i>(0);
332 res = vsetq_lane_s32(from[0*stride], res, 0);
333 res = vsetq_lane_s32(from[1*stride], res, 1);
334 res = vsetq_lane_s32(from[2*stride], res, 2);
335 res = vsetq_lane_s32(from[3*stride], res, 3);
336 return res;
337}
338
339template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
340{
341 to[stride*0] = vgetq_lane_f32(from, 0);
342 to[stride*1] = vgetq_lane_f32(from, 1);
343 to[stride*2] = vgetq_lane_f32(from, 2);
344 to[stride*3] = vgetq_lane_f32(from, 3);
345}
346template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
347{
348 to[stride*0] = vgetq_lane_s32(from, 0);
349 to[stride*1] = vgetq_lane_s32(from, 1);
350 to[stride*2] = vgetq_lane_s32(from, 2);
351 to[stride*3] = vgetq_lane_s32(from, 3);
352}
353
// Hint the CPU to pull the given address into cache (see EIGEN_ARM_PREFETCH above).
template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*   addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
356
357// FIXME only store the 2 first elements ?
358template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
359template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
360
361template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
362 float32x2_t a_lo, a_hi;
363 Packet4f a_r64;
364
365 a_r64 = vrev64q_f32(a);
366 a_lo = vget_low_f32(a_r64);
367 a_hi = vget_high_f32(a_r64);
368 return vcombine_f32(a_hi, a_lo);
369}
370template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
371 int32x2_t a_lo, a_hi;
372 Packet4i a_r64;
373
374 a_r64 = vrev64q_s32(a);
375 a_lo = vget_low_s32(a_r64);
376 a_hi = vget_high_s32(a_r64);
377 return vcombine_s32(a_hi, a_lo);
378}
379
// Lane-wise absolute value.
template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
382
383template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
384{
385 float32x2_t a_lo, a_hi, sum;
386
387 a_lo = vget_low_f32(a);
388 a_hi = vget_high_f32(a);
389 sum = vpadd_f32(a_lo, a_hi);
390 sum = vpadd_f32(sum, sum);
391 return vget_lane_f32(sum, 0);
392}
393
394template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
395{
396 float32x4x2_t vtrn1, vtrn2, res1, res2;
397 Packet4f sum1, sum2, sum;
398
399 // NEON zip performs interleaving of the supplied vectors.
400 // We perform two interleaves in a row to acquire the transposed vector
401 vtrn1 = vzipq_f32(vecs[0], vecs[2]);
402 vtrn2 = vzipq_f32(vecs[1], vecs[3]);
403 res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
404 res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
405
406 // Do the addition of the resulting vectors
407 sum1 = vaddq_f32(res1.val[0], res1.val[1]);
408 sum2 = vaddq_f32(res2.val[0], res2.val[1]);
409 sum = vaddq_f32(sum1, sum2);
410
411 return sum;
412}
413
414template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
415{
416 int32x2_t a_lo, a_hi, sum;
417
418 a_lo = vget_low_s32(a);
419 a_hi = vget_high_s32(a);
420 sum = vpadd_s32(a_lo, a_hi);
421 sum = vpadd_s32(sum, sum);
422 return vget_lane_s32(sum, 0);
423}
424
425template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
426{
427 int32x4x2_t vtrn1, vtrn2, res1, res2;
428 Packet4i sum1, sum2, sum;
429
430 // NEON zip performs interleaving of the supplied vectors.
431 // We perform two interleaves in a row to acquire the transposed vector
432 vtrn1 = vzipq_s32(vecs[0], vecs[2]);
433 vtrn2 = vzipq_s32(vecs[1], vecs[3]);
434 res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
435 res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
436
437 // Do the addition of the resulting vectors
438 sum1 = vaddq_s32(res1.val[0], res1.val[1]);
439 sum2 = vaddq_s32(res2.val[0], res2.val[1]);
440 sum = vaddq_s32(sum1, sum2);
441
442 return sum;
443}
444
445// Other reduction functions:
446// mul
447template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
448{
449 float32x2_t a_lo, a_hi, prod;
450
451 // Get a_lo = |a1|a2| and a_hi = |a3|a4|
452 a_lo = vget_low_f32(a);
453 a_hi = vget_high_f32(a);
454 // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
455 prod = vmul_f32(a_lo, a_hi);
456 // Multiply prod with its swapped value |a2*a4|a1*a3|
457 prod = vmul_f32(prod, vrev64_f32(prod));
458
459 return vget_lane_f32(prod, 0);
460}
461template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
462{
463 int32x2_t a_lo, a_hi, prod;
464
465 // Get a_lo = |a1|a2| and a_hi = |a3|a4|
466 a_lo = vget_low_s32(a);
467 a_hi = vget_high_s32(a);
468 // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
469 prod = vmul_s32(a_lo, a_hi);
470 // Multiply prod with its swapped value |a2*a4|a1*a3|
471 prod = vmul_s32(prod, vrev64_s32(prod));
472
473 return vget_lane_s32(prod, 0);
474}
475
476// min
477template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
478{
479 float32x2_t a_lo, a_hi, min;
480
481 a_lo = vget_low_f32(a);
482 a_hi = vget_high_f32(a);
483 min = vpmin_f32(a_lo, a_hi);
484 min = vpmin_f32(min, min);
485
486 return vget_lane_f32(min, 0);
487}
488
489template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
490{
491 int32x2_t a_lo, a_hi, min;
492
493 a_lo = vget_low_s32(a);
494 a_hi = vget_high_s32(a);
495 min = vpmin_s32(a_lo, a_hi);
496 min = vpmin_s32(min, min);
497
498 return vget_lane_s32(min, 0);
499}
500
501// max
502template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
503{
504 float32x2_t a_lo, a_hi, max;
505
506 a_lo = vget_low_f32(a);
507 a_hi = vget_high_f32(a);
508 max = vpmax_f32(a_lo, a_hi);
509 max = vpmax_f32(max, max);
510
511 return vget_lane_f32(max, 0);
512}
513
514template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
515{
516 int32x2_t a_lo, a_hi, max;
517
518 a_lo = vget_low_s32(a);
519 a_hi = vget_high_s32(a);
520 max = vpmax_s32(a_lo, a_hi);
521 max = vpmax_s32(max, max);
522
523 return vget_lane_s32(max, 0);
524}
525
// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
// palign_impl<Offset,Type>::run(first,second) shifts `first` down by Offset
// lanes, filling the vacated tail with the leading lanes of `second` (vext).
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
    {\
        if (Offset!=0)\
            first = Command(first, second, Offset);\
    }\
};\

PALIGN_NEON(0,Packet4f,vextq_f32)
PALIGN_NEON(1,Packet4f,vextq_f32)
PALIGN_NEON(2,Packet4f,vextq_f32)
PALIGN_NEON(3,Packet4f,vextq_f32)
PALIGN_NEON(0,Packet4i,vextq_s32)
PALIGN_NEON(1,Packet4i,vextq_s32)
PALIGN_NEON(2,Packet4i,vextq_s32)
PALIGN_NEON(3,Packet4i,vextq_s32)

#undef PALIGN_NEON
549
550EIGEN_DEVICE_FUNC inline void
551ptranspose(PacketBlock<Packet4f,4>& kernel) {
552 float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
553 float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
554
555 kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
556 kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
557 kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
558 kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
559}
560
561EIGEN_DEVICE_FUNC inline void
562ptranspose(PacketBlock<Packet4i,4>& kernel) {
563 int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
564 int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
565 kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
566 kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
567 kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
568 kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
569}
570
571//---------- double ----------
572
573// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double.
574// Confirmed at least with __apple_build_version__ = 6000054.
#ifdef __apple_build_version__
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
// When non-zero, the double-precision NEON code below is disabled entirely.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
#else
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#endif
583
584#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
585
586// Bug 907: workaround missing declarations of the following two functions in the ADK
587// Defining these functions as templates ensures that if these intrinsics are
588// already defined in arm_neon.h, then our workaround doesn't cause a conflict
589// and has lower priority in overload resolution.
// Bug 907: workaround missing declarations of the following two functions in the ADK
// Defining these functions as templates ensures that if these intrinsics are
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
// and has lower priority in overload resolution.
template <typename T>
uint64x2_t vreinterpretq_u64_f64(T a)
{
  return (uint64x2_t) a;
}

template <typename T>
float64x2_t vreinterpretq_f64_u64(T a)
{
  return (float64x2_t) a;
}

typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;

// Packet traits for double: 2-wide NEON vectorization (AArch64 only).
template<> struct packet_traits<double>  : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket=0,

    HasDiv  = 1,
    // FIXME check the Has*
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 0,
    HasSqrt = 0
  };
};

template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
626
627template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
628
629template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
630{
631 const double countdown_raw[] = {0.0,1.0};
632 const Packet2d countdown = vld1q_f64(countdown_raw);
633 return vaddq_f64(pset1<Packet2d>(a), countdown);
634}
// Lane-wise double-precision arithmetic, mapped to AArch64 NEON intrinsics.
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }

// Conjugation is the identity for real-valued packets.
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }

// AArch64 provides a native double-precision vector divide.
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }

#ifdef EIGEN_VECTORIZE_FMA
// See bug 936. See above comment about FMA for float.
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
#endif

template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }

// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

// pandnot(a,b) computes a AND NOT b (bit-clear, vbic).
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

// Aligned / unaligned full-packet loads.
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }

// Duplicates the single scalar into both lanes: {from[0], from[0]}.
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  return vld1q_dup_f64(from);
}
// Aligned / unaligned full-packet stores.
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
690
691template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
692{
693 Packet2d res = pset1<Packet2d>(0.0);
694 res = vsetq_lane_f64(from[0*stride], res, 0);
695 res = vsetq_lane_f64(from[1*stride], res, 1);
696 return res;
697}
698template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
699{
700 to[stride*0] = vgetq_lane_f64(from, 0);
701 to[stride*1] = vgetq_lane_f64(from, 1);
702}
// Hint the CPU to pull the given address into cache.
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }

// Extract lane 0 directly.
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }

// Swap the two lanes.
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }

// Lane-wise absolute value.
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }

#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
// workaround ICE, see bug 907
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
#else
// Horizontal sum of the two lanes.
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
#endif

// Horizontal sums of two packets at once: result[i] = sum of vecs[i]'s lanes.
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  float64x2_t trn1, trn2;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  trn1 = vzip1q_f64(vecs[0], vecs[1]);
  trn2 = vzip2q_f64(vecs[0], vecs[1]);

  // Do the addition of the resulting vectors
  return vaddq_f64(trn1, trn2);
}
// Other reduction functions:
// mul
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
// workaround ICE, see bug 907
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
#endif

// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }

// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }

// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
    {\
        if (Offset!=0)\
            first = Command(first, second, Offset);\
    }\
};\

PALIGN_NEON(0,Packet2d,vextq_f64)
PALIGN_NEON(1,Packet2d,vextq_f64)
#undef PALIGN_NEON
761
762EIGEN_DEVICE_FUNC inline void
763ptranspose(PacketBlock<Packet2d,2>& kernel) {
764 float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
765 float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
766
767 kernel.packet[0] = trn1;
768 kernel.packet[1] = trn2;
769}
770#endif // EIGEN_ARCH_ARM64
771
772} // end namespace internal
773
774} // end namespace Eigen
775
776#endif // EIGEN_PACKET_MATH_NEON_H
@ Aligned16
Definition Constants.h:230
Namespace containing all symbols from the Eigen library.
Definition A05_PortingFrom2To3.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:65