12#ifndef EIGEN_PACKET_MATH_NEON_H
13#define EIGEN_PACKET_MATH_NEON_H
19#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
20#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
23#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
24#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
27#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
28#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
31#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
33#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
35#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
// Thin wrapper around a raw NEON vector type.  The unique_id template
// parameter makes otherwise-identical vector types distinct C++ types, so
// overload resolution can tell e.g. Packet4i and Packet4ui apart on
// compilers where the NEON types are indistinguishable typedefs.
template <typename T, int unique_id>
struct eigen_packet_wrapper
{
  // Implicit conversions to/from the wrapped vector keep intrinsics usable
  // directly on the wrapper.
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  eigen_packet_wrapper() {}
  eigen_packet_wrapper(const T &v) : m_val(v) {}
  eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  T m_val;
};
60typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
61typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
62typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
63typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
64typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
68typedef float32x2_t Packet2f;
69typedef float32x4_t Packet4f;
70typedef int32x4_t Packet4i;
71typedef int32x2_t Packet2i;
72typedef uint32x4_t Packet4ui;
76#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
77 const Packet4f p4f_##NAME = pset1<Packet4f>(X)
79#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
80 const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
82#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
83 const Packet4i p4i_##NAME = pset1<Packet4i>(X)
89 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
90#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
91 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
93 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
95 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
98 #define EIGEN_ARM_PREFETCH(ADDR)
101template<>
struct packet_traits<float> : default_packet_traits
103 typedef Packet4f type;
104 typedef Packet4f half;
120template<>
struct packet_traits<int32_t> : default_packet_traits
122 typedef Packet4i type;
123 typedef Packet4i half;
133#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
135EIGEN_STRONG_INLINE float32x4_t vld1q_f32(
const float* x) { return ::vld1q_f32((
const float32_t*)x); }
136EIGEN_STRONG_INLINE float32x2_t vld1_f32 (
const float* x) { return ::vld1_f32 ((
const float32_t*)x); }
137EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (
const float* x) { return ::vld1_dup_f32 ((
const float32_t*)x); }
138EIGEN_STRONG_INLINE
void vst1q_f32(
float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
139EIGEN_STRONG_INLINE
void vst1_f32 (
float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
142template<>
struct unpacket_traits<Packet4f> {
typedef float type;
enum {size=4, alignment=
Aligned16};
typedef Packet4f half; };
143template<>
struct unpacket_traits<Packet4i> {
typedef int32_t type;
enum {size=4, alignment=
Aligned16};
typedef Packet4i half; };
145template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
return vdupq_n_f32(from); }
146template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
return vdupq_n_s32(from); }
148template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a)
150 const float f[] = {0, 1, 2, 3};
151 Packet4f countdown = vld1q_f32(f);
152 return vaddq_f32(pset1<Packet4f>(a), countdown);
154template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a)
156 const int32_t i[] = {0, 1, 2, 3};
157 Packet4i countdown = vld1q_s32(i);
158 return vaddq_s32(pset1<Packet4i>(a), countdown);
161template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vaddq_f32(a,b); }
162template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vaddq_s32(a,b); }
164template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vsubq_f32(a,b); }
165template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vsubq_s32(a,b); }
167template<> EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
return vnegq_f32(a); }
168template<> EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
return vnegq_s32(a); }
170template<> EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
return a; }
171template<> EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
return a; }
173template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vmulq_f32(a,b); }
174template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vmulq_s32(a,b); }
176EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
177 return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
180EIGEN_STRONG_INLINE Packet4f pcmp_le(
const Packet4f& a,
const Packet4f& b) {
181 return vreinterpretq_f32_u32(vcleq_f32(a, b));
184EIGEN_STRONG_INLINE Packet4f preciprocal(
const Packet4f& a)
187 float32x4_t result = vrecpeq_f32(a);
188 result = vmulq_f32(vrecpsq_f32(a, result), result);
189 result = vmulq_f32(vrecpsq_f32(a, result), result);
194template<> EIGEN_STRONG_INLINE Packet4f pdiv(
const Packet4f& a,
const Packet4f& b) {
return vdivq_f32(a, b); }
195template<> EIGEN_STRONG_INLINE Packet2f pdiv(
const Packet2f& a,
const Packet2f& b) {
return vdiv_f32(a, b); }
197template<
typename Packet>
198EIGEN_STRONG_INLINE Packet pdiv_float_common(
const Packet& a,
const Packet& b) {
202 const Packet cst_one = pset1<Packet>(1.0f);
203 const Packet cst_quarter = pset1<Packet>(0.25f);
204 const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
206 Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
207 Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
208 Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
212template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
213 return pdiv_float_common(a, b);
218template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& ,
const Packet4i& )
219{ eigen_assert(
false &&
"packet integer division are not supported by NEON");
220 return pset1<Packet4i>(0);
227#if (defined EIGEN_VECTORIZE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
234template<> EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
return vfmaq_f32(c,a,b); }
236template<> EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
237#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
247 "vmla.f32 %q[r], %q[a], %q[b]"
254 return vmlaq_f32(c,a,b);
260template<> EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
return vmlaq_s32(c,a,b); }
262template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vminq_f32(a,b); }
263template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vminq_s32(a,b); }
265template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
return vmaxq_f32(a,b); }
266template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vmaxq_s32(a,b); }
269template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b)
271 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
273template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vandq_s32(a,b); }
275template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b)
277 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
279template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vorrq_s32(a,b); }
281template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b)
283 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
285template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return veorq_s32(a,b); }
287template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b)
289 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
291template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
return vbicq_s32(a,b); }
293template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) { EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f32(from); }
294template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_s32(from); }
296template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f32(from); }
297template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_s32(from); }
299template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from)
302 lo = vld1_dup_f32(from);
303 hi = vld1_dup_f32(from+1);
304 return vcombine_f32(lo, hi);
306template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from)
309 lo = vld1_dup_s32(from);
310 hi = vld1_dup_s32(from+1);
311 return vcombine_s32(lo, hi);
314template<> EIGEN_STRONG_INLINE
void pstore<float> (
float* to,
const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
315template<> EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
317template<> EIGEN_STRONG_INLINE
void pstoreu<float> (
float* to,
const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
318template<> EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
320template<> EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride)
322 Packet4f res = pset1<Packet4f>(0.f);
323 res = vsetq_lane_f32(from[0*stride], res, 0);
324 res = vsetq_lane_f32(from[1*stride], res, 1);
325 res = vsetq_lane_f32(from[2*stride], res, 2);
326 res = vsetq_lane_f32(from[3*stride], res, 3);
329template<> EIGEN_DEVICE_FUNC
inline Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride)
331 Packet4i res = pset1<Packet4i>(0);
332 res = vsetq_lane_s32(from[0*stride], res, 0);
333 res = vsetq_lane_s32(from[1*stride], res, 1);
334 res = vsetq_lane_s32(from[2*stride], res, 2);
335 res = vsetq_lane_s32(from[3*stride], res, 3);
339template<> EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride)
341 to[stride*0] = vgetq_lane_f32(from, 0);
342 to[stride*1] = vgetq_lane_f32(from, 1);
343 to[stride*2] = vgetq_lane_f32(from, 2);
344 to[stride*3] = vgetq_lane_f32(from, 3);
346template<> EIGEN_DEVICE_FUNC
inline void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
Index stride)
348 to[stride*0] = vgetq_lane_s32(from, 0);
349 to[stride*1] = vgetq_lane_s32(from, 1);
350 to[stride*2] = vgetq_lane_s32(from, 2);
351 to[stride*3] = vgetq_lane_s32(from, 3);
354template<> EIGEN_STRONG_INLINE
void prefetch<float> (
const float* addr) { EIGEN_ARM_PREFETCH(addr); }
355template<> EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
358template<> EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a);
return x[0]; }
359template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a);
return x[0]; }
361template<> EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
362 float32x2_t a_lo, a_hi;
365 a_r64 = vrev64q_f32(a);
366 a_lo = vget_low_f32(a_r64);
367 a_hi = vget_high_f32(a_r64);
368 return vcombine_f32(a_hi, a_lo);
370template<> EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
371 int32x2_t a_lo, a_hi;
374 a_r64 = vrev64q_s32(a);
375 a_lo = vget_low_s32(a_r64);
376 a_hi = vget_high_s32(a_r64);
377 return vcombine_s32(a_hi, a_lo);
380template<> EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
return vabsq_f32(a); }
381template<> EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
return vabsq_s32(a); }
383template<> EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a)
385 float32x2_t a_lo, a_hi, sum;
387 a_lo = vget_low_f32(a);
388 a_hi = vget_high_f32(a);
389 sum = vpadd_f32(a_lo, a_hi);
390 sum = vpadd_f32(sum, sum);
391 return vget_lane_f32(sum, 0);
394template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(
const Packet4f* vecs)
396 float32x4x2_t vtrn1, vtrn2, res1, res2;
397 Packet4f sum1, sum2, sum;
401 vtrn1 = vzipq_f32(vecs[0], vecs[2]);
402 vtrn2 = vzipq_f32(vecs[1], vecs[3]);
403 res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
404 res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
407 sum1 = vaddq_f32(res1.val[0], res1.val[1]);
408 sum2 = vaddq_f32(res2.val[0], res2.val[1]);
409 sum = vaddq_f32(sum1, sum2);
414template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a)
416 int32x2_t a_lo, a_hi, sum;
418 a_lo = vget_low_s32(a);
419 a_hi = vget_high_s32(a);
420 sum = vpadd_s32(a_lo, a_hi);
421 sum = vpadd_s32(sum, sum);
422 return vget_lane_s32(sum, 0);
425template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(
const Packet4i* vecs)
427 int32x4x2_t vtrn1, vtrn2, res1, res2;
428 Packet4i sum1, sum2, sum;
432 vtrn1 = vzipq_s32(vecs[0], vecs[2]);
433 vtrn2 = vzipq_s32(vecs[1], vecs[3]);
434 res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
435 res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
438 sum1 = vaddq_s32(res1.val[0], res1.val[1]);
439 sum2 = vaddq_s32(res2.val[0], res2.val[1]);
440 sum = vaddq_s32(sum1, sum2);
447template<> EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a)
449 float32x2_t a_lo, a_hi, prod;
452 a_lo = vget_low_f32(a);
453 a_hi = vget_high_f32(a);
455 prod = vmul_f32(a_lo, a_hi);
457 prod = vmul_f32(prod, vrev64_f32(prod));
459 return vget_lane_f32(prod, 0);
461template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a)
463 int32x2_t a_lo, a_hi, prod;
466 a_lo = vget_low_s32(a);
467 a_hi = vget_high_s32(a);
469 prod = vmul_s32(a_lo, a_hi);
471 prod = vmul_s32(prod, vrev64_s32(prod));
473 return vget_lane_s32(prod, 0);
477template<> EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a)
479 float32x2_t a_lo, a_hi, min;
481 a_lo = vget_low_f32(a);
482 a_hi = vget_high_f32(a);
483 min = vpmin_f32(a_lo, a_hi);
484 min = vpmin_f32(min, min);
486 return vget_lane_f32(min, 0);
489template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a)
491 int32x2_t a_lo, a_hi, min;
493 a_lo = vget_low_s32(a);
494 a_hi = vget_high_s32(a);
495 min = vpmin_s32(a_lo, a_hi);
496 min = vpmin_s32(min, min);
498 return vget_lane_s32(min, 0);
502template<> EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a)
504 float32x2_t a_lo, a_hi, max;
506 a_lo = vget_low_f32(a);
507 a_hi = vget_high_f32(a);
508 max = vpmax_f32(a_lo, a_hi);
509 max = vpmax_f32(max, max);
511 return vget_lane_f32(max, 0);
514template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a)
516 int32x2_t a_lo, a_hi, max;
518 a_lo = vget_low_s32(a);
519 a_hi = vget_high_s32(a);
520 max = vpmax_s32(a_lo, a_hi);
521 max = vpmax_s32(max, max);
523 return vget_lane_s32(max, 0);
528#define PALIGN_NEON(Offset,Type,Command) \
530struct palign_impl<Offset,Type>\
532 EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
535 first = Command(first, second, Offset);\
539PALIGN_NEON(0,Packet4f,vextq_f32)
540PALIGN_NEON(1,Packet4f,vextq_f32)
541PALIGN_NEON(2,Packet4f,vextq_f32)
542PALIGN_NEON(3,Packet4f,vextq_f32)
543PALIGN_NEON(0,Packet4i,vextq_s32)
544PALIGN_NEON(1,Packet4i,vextq_s32)
545PALIGN_NEON(2,Packet4i,vextq_s32)
546PALIGN_NEON(3,Packet4i,vextq_s32)
550EIGEN_DEVICE_FUNC
inline void
551ptranspose(PacketBlock<Packet4f,4>& kernel) {
552 float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
553 float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
555 kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
556 kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
557 kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
558 kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
561EIGEN_DEVICE_FUNC
inline void
562ptranspose(PacketBlock<Packet4i,4>& kernel) {
563 int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
564 int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
565 kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
566 kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
567 kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
568 kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
575#ifdef __apple_build_version__
579#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
581#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
584#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
591uint64x2_t vreinterpretq_u64_f64(T a)
593 return (uint64x2_t) a;
597float64x2_t vreinterpretq_f64_u64(T a)
599 return (float64x2_t) a;
602typedef float64x2_t Packet2d;
603typedef float64x1_t Packet1d;
605template<>
struct packet_traits<double> : default_packet_traits
607 typedef Packet2d type;
608 typedef Packet2d half;
625template<>
struct unpacket_traits<Packet2d> {
typedef double type;
enum {size=2, alignment=
Aligned16};
typedef Packet2d half; };
627template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
return vdupq_n_f64(from); }
629template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a)
631 const double countdown_raw[] = {0.0,1.0};
632 const Packet2d countdown = vld1q_f64(countdown_raw);
633 return vaddq_f64(pset1<Packet2d>(a), countdown);
635template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vaddq_f64(a,b); }
637template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vsubq_f64(a,b); }
639template<> EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
return vnegq_f64(a); }
641template<> EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
return a; }
643template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vmulq_f64(a,b); }
645template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vdivq_f64(a,b); }
647#ifdef EIGEN_VECTORIZE_FMA
649template<> EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
return vfmaq_f64(c,a,b); }
651template<> EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
return vmlaq_f64(c,a,b); }
654template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vminq_f64(a,b); }
656template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
return vmaxq_f64(a,b); }
659template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b)
661 return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
664template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b)
666 return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
669template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b)
671 return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
674template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b)
676 return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
679template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) { EIGEN_DEBUG_ALIGNED_LOAD
return vld1q_f64(from); }
681template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD
return vld1q_f64(from); }
683template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from)
685 return vld1q_dup_f64(from);
687template<> EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
689template<> EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
691template<> EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from,
Index stride)
693 Packet2d res = pset1<Packet2d>(0.0);
694 res = vsetq_lane_f64(from[0*stride], res, 0);
695 res = vsetq_lane_f64(from[1*stride], res, 1);
698template<> EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride)
700 to[stride*0] = vgetq_lane_f64(from, 0);
701 to[stride*1] = vgetq_lane_f64(from, 1);
703template<> EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) { EIGEN_ARM_PREFETCH(addr); }
706template<> EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
return vgetq_lane_f64(a, 0); }
708template<> EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
710template<> EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
return vabsq_f64(a); }
712#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
714template<> EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
return (vget_low_f64(a) + vget_high_f64(a))[0]; }
716template<> EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
719template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(
const Packet2d* vecs)
721 float64x2_t trn1, trn2;
725 trn1 = vzip1q_f64(vecs[0], vecs[1]);
726 trn2 = vzip2q_f64(vecs[0], vecs[1]);
729 return vaddq_f64(trn1, trn2);
733#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
734template<> EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
return (vget_low_f64(a) * vget_high_f64(a))[0]; }
736template<> EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
740template<> EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
return vgetq_lane_f64(vpminq_f64(a, a), 0); }
743template<> EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
747#define PALIGN_NEON(Offset,Type,Command) \
749struct palign_impl<Offset,Type>\
751 EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
754 first = Command(first, second, Offset);\
758PALIGN_NEON(0,Packet2d,vextq_f64)
759PALIGN_NEON(1,Packet2d,vextq_f64)
762EIGEN_DEVICE_FUNC
inline void
763ptranspose(PacketBlock<Packet2d,2>& kernel) {
764 float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
765 float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
767 kernel.packet[0] = trn1;
768 kernel.packet[1] = trn2;
// Cross-references (doxygen tooltip residue from the documentation build):
// - Aligned16: alignment constant, defined in Eigen/src/Core/util/Constants.h
// - Eigen: namespace containing all symbols from the Eigen library
// - Index (EIGEN_DEFAULT_DENSE_INDEX_TYPE): the Index type as used for the
//   API, defined in Eigen/src/Core/util/Meta.h