Eigen  5.0.1-dev+284dcc12
 
Loading...
Searching...
No Matches
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6// Heavily based on Gael's SSE version.
7//
8// This Source Code Form is subject to the terms of the Mozilla
9// Public License v. 2.0. If a copy of the MPL was not distributed
10// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
12#ifndef EIGEN_PACKET_MATH_NEON_H
13#define EIGEN_PACKET_MATH_NEON_H
14
15// IWYU pragma: private
16#include "../../InternalHeaderCheck.h"
17
18namespace Eigen {
19
20namespace internal {
21
// Default for EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD; a definition made earlier wins.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// Defined (with an empty value) unless something earlier already defined it.
#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// Default register count: 32 when EIGEN_ARCH_ARM64 is non-zero, 16 otherwise.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
37
38#if EIGEN_COMP_MSVC_STRICT
39
40// In MSVC's arm_neon.h header file, all NEON vector types
41// are aliases to the same underlying type __n128.
42// We thus have to wrap them to make them different C++ types.
43// (See also bug 1428)
44typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
45typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
46typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
47typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
48typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
49typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
50typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
51typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
52typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
53typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
54typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
55typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
56typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
57typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
58typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
59typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
60typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
61typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
62
63EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
64 float from[4] = {a, b, c, d};
65 return vld1q_f32(from);
66}
67
68EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
69 float from[2] = {a, b};
70 return vld1_f32(from);
71}
72
73#else
74
75typedef float32x2_t Packet2f;
76typedef float32x4_t Packet4f;
77typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
78typedef int8x8_t Packet8c;
79typedef int8x16_t Packet16c;
80typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
81typedef uint8x8_t Packet8uc;
82typedef uint8x16_t Packet16uc;
83typedef int16x4_t Packet4s;
84typedef int16x8_t Packet8s;
85typedef uint16x4_t Packet4us;
86typedef uint16x8_t Packet8us;
87typedef int32x2_t Packet2i;
88typedef int32x4_t Packet4i;
89typedef uint32x2_t Packet2ui;
90typedef uint32x4_t Packet4ui;
91typedef int64x2_t Packet2l;
92typedef uint64x2_t Packet2ul;
93
94EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
95EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
96
97#endif // EIGEN_COMP_MSVC_STRICT
98
99EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
100 const float* a = reinterpret_cast<const float*>(&m);
101 Packet4f res =
102 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
103 return res;
104}
105
106// functionally equivalent to _mm_shuffle_ps in SSE when interleave
107// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
108// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
109// to enable a shared implementation for fast inversion of matrices of size 4.
110template <bool interleave>
111EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
112 const float* a = reinterpret_cast<const float*>(&m);
113 const float* b = reinterpret_cast<const float*>(&n);
114 Packet4f res =
115 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
116 return res;
117}
118
119template <>
120EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
121 const float* a = reinterpret_cast<const float*>(&m);
122 const float* b = reinterpret_cast<const float*>(&n);
123 Packet4f res =
124 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
125 return res;
126}
127
128EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
129 return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
130}
131
132EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
133 return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
134}
135EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
136 return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
137}
138EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
139 return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
140}
141EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
142 return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
143}
144EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
145 return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
146}
147EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
148 return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
149}
// Broadcasts lane p of the Packet4f a to all four lanes. p must be a
// compile-time constant in [0, 3]. The lane is read from the full 128-bit
// vector rather than from its low half, so lanes 2 and 3 are usable as well;
// results for p = 0 and p = 1 are unchanged. Arguments are parenthesized so
// that expressions can be passed safely.
#define vec4f_duplane(a, p) Packet4f(vdupq_n_f32(vgetq_lane_f32((a), (p))))
151
// Declares `const Packet4f p4f_NAME` with every lane set to the float X.
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares `const Packet4f p4f_NAME` whose four lanes all carry the 32-bit
// pattern X reinterpreted as float. The pattern is splatted with vdupq_n_u32
// because vreinterpretq_f32_u32 takes a uint32x4_t; this is the same sequence
// pset1frombits<Packet4f> uses.
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(vdupq_n_u32(static_cast<uint32_t>(X)))

// Declares `const Packet4i p4i_NAME` with every lane set to the integer X.
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
158
159#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
160// __builtin_prefetch tends to do nothing on ARM64 compilers because the
161// prefetch instructions there are too detailed for __builtin_prefetch to map
162// meaningfully to them.
163#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
164#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
165#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
166#elif defined __pld
167#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
168#elif EIGEN_ARCH_ARM
169#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
170#else
171// by default no explicit prefetching
172#define EIGEN_ARM_PREFETCH(ADDR)
173#endif
174
175template <>
176struct packet_traits<float> : default_packet_traits {
177 typedef Packet4f type;
178 typedef Packet2f half;
179 enum {
180 Vectorizable = 1,
181 AlignedOnScalar = 1,
182 size = 4,
183
184 HasCmp = 1,
185 HasAdd = 1,
186 HasSub = 1,
187 HasShift = 1,
188 HasMul = 1,
189 HasNegate = 1,
190 HasAbs = 1,
191 HasArg = 0,
192 HasAbs2 = 1,
193 HasAbsDiff = 1,
194 HasMin = 1,
195 HasMax = 1,
196 HasConj = 1,
197 HasSetLinear = 1,
198 HasBlend = 0,
199 HasDiv = 1,
200 HasSin = EIGEN_FAST_MATH,
201 HasCos = EIGEN_FAST_MATH,
202 HasACos = 1,
203 HasASin = 1,
204 HasATan = 1,
205 HasATanh = 1,
206 HasLog = 1,
207 HasExp = 1,
208 HasPow = 1,
209 HasSqrt = 1,
210 HasRsqrt = 1,
211 HasCbrt = 1,
212 HasTanh = EIGEN_FAST_MATH,
213 HasErf = EIGEN_FAST_MATH,
214 HasErfc = EIGEN_FAST_MATH,
215 HasBessel = 0, // Issues with accuracy.
216 HasNdtri = 0
217 };
218};
219
220template <>
221struct packet_traits<int8_t> : default_packet_traits {
222 typedef Packet16c type;
223 typedef Packet8c half;
224 enum {
225 Vectorizable = 1,
226 AlignedOnScalar = 1,
227 size = 16,
228
229 HasCmp = 1,
230 HasAdd = 1,
231 HasSub = 1,
232 HasShift = 1,
233 HasMul = 1,
234 HasNegate = 1,
235 HasAbs = 1,
236 HasAbsDiff = 1,
237 HasArg = 0,
238 HasAbs2 = 1,
239 HasMin = 1,
240 HasMax = 1,
241 HasConj = 1,
242 HasSetLinear = 1,
243 HasBlend = 0
244 };
245};
246
247template <>
248struct packet_traits<uint8_t> : default_packet_traits {
249 typedef Packet16uc type;
250 typedef Packet8uc half;
251 enum {
252 Vectorizable = 1,
253 AlignedOnScalar = 1,
254 size = 16,
255
256 HasCmp = 1,
257 HasAdd = 1,
258 HasSub = 1,
259 HasShift = 1,
260 HasMul = 1,
261 HasNegate = 0,
262 HasAbs = 1,
263 HasAbsDiff = 1,
264 HasArg = 0,
265 HasAbs2 = 1,
266 HasMin = 1,
267 HasMax = 1,
268 HasConj = 1,
269 HasSetLinear = 1,
270 HasBlend = 0,
271
272 HasSqrt = 1
273 };
274};
275
276template <>
277struct packet_traits<int16_t> : default_packet_traits {
278 typedef Packet8s type;
279 typedef Packet4s half;
280 enum {
281 Vectorizable = 1,
282 AlignedOnScalar = 1,
283 size = 8,
284
285 HasCmp = 1,
286 HasAdd = 1,
287 HasSub = 1,
288 HasShift = 1,
289 HasMul = 1,
290 HasNegate = 1,
291 HasAbs = 1,
292 HasAbsDiff = 1,
293 HasArg = 0,
294 HasAbs2 = 1,
295 HasMin = 1,
296 HasMax = 1,
297 HasConj = 1,
298 HasSetLinear = 1,
299 HasBlend = 0
300 };
301};
302
303template <>
304struct packet_traits<uint16_t> : default_packet_traits {
305 typedef Packet8us type;
306 typedef Packet4us half;
307 enum {
308 Vectorizable = 1,
309 AlignedOnScalar = 1,
310 size = 8,
311
312 HasCmp = 1,
313 HasAdd = 1,
314 HasSub = 1,
315 HasShift = 1,
316 HasMul = 1,
317 HasNegate = 0,
318 HasAbs = 1,
319 HasAbsDiff = 1,
320 HasArg = 0,
321 HasAbs2 = 1,
322 HasMin = 1,
323 HasMax = 1,
324 HasConj = 1,
325 HasSetLinear = 1,
326 HasBlend = 0,
327 HasSqrt = 1
328 };
329};
330
331template <>
332struct packet_traits<int32_t> : default_packet_traits {
333 typedef Packet4i type;
334 typedef Packet2i half;
335 enum {
336 Vectorizable = 1,
337 AlignedOnScalar = 1,
338 size = 4,
339
340 HasCmp = 1,
341 HasAdd = 1,
342 HasSub = 1,
343 HasShift = 1,
344 HasMul = 1,
345 HasNegate = 1,
346 HasAbs = 1,
347 HasArg = 0,
348 HasAbs2 = 1,
349 HasAbsDiff = 1,
350 HasMin = 1,
351 HasMax = 1,
352 HasConj = 1,
353 HasSetLinear = 1,
354 HasBlend = 0
355 };
356};
357
358template <>
359struct packet_traits<uint32_t> : default_packet_traits {
360 typedef Packet4ui type;
361 typedef Packet2ui half;
362 enum {
363 Vectorizable = 1,
364 AlignedOnScalar = 1,
365 size = 4,
366
367 HasCmp = 1,
368 HasAdd = 1,
369 HasSub = 1,
370 HasShift = 1,
371 HasMul = 1,
372 HasNegate = 0,
373 HasAbs = 1,
374 HasArg = 0,
375 HasAbs2 = 1,
376 HasAbsDiff = 1,
377 HasMin = 1,
378 HasMax = 1,
379 HasConj = 1,
380 HasSetLinear = 1,
381 HasBlend = 0,
382
383 HasSqrt = 1
384 };
385};
386
387template <>
388struct packet_traits<int64_t> : default_packet_traits {
389 typedef Packet2l type;
390 typedef Packet2l half;
391 enum {
392 Vectorizable = 1,
393 AlignedOnScalar = 1,
394 size = 2,
395
396 HasCmp = 1,
397 HasAdd = 1,
398 HasSub = 1,
399 HasShift = 1,
400 HasMul = 1,
401 HasNegate = 1,
402 HasAbs = 1,
403 HasArg = 0,
404 HasAbs2 = 1,
405 HasAbsDiff = 1,
406 HasMin = 1,
407 HasMax = 1,
408 HasConj = 1,
409 HasSetLinear = 1,
410 HasBlend = 0
411 };
412};
413
414template <>
415struct packet_traits<uint64_t> : default_packet_traits {
416 typedef Packet2ul type;
417 typedef Packet2ul half;
418 enum {
419 Vectorizable = 1,
420 AlignedOnScalar = 1,
421 size = 2,
422
423 HasCmp = 1,
424 HasAdd = 1,
425 HasSub = 1,
426 HasShift = 1,
427 HasMul = 1,
428 HasNegate = 0,
429 HasAbs = 1,
430 HasArg = 0,
431 HasAbs2 = 1,
432 HasAbsDiff = 1,
433 HasMin = 1,
434 HasMax = 1,
435 HasConj = 1,
436 HasSetLinear = 1,
437 HasBlend = 0
438 };
439};
440
// Shared defaults for the unpacket_traits specializations of a packet type:
//  - type: the scalar stored in each lane;
//  - half: the packet itself (derived traits redeclare it when a narrower packet exists);
//  - size: number of Scalar lanes that fit in Packet;
//  - alignment: sizeof(Packet), in bytes;
//  - vectorizable is true, masked loads and stores are reported as unavailable.
template <typename Packet, typename Scalar>
struct neon_unpacket_default {
  typedef Scalar type;
  typedef Packet half;
  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
  static constexpr int alignment = static_cast<int>(sizeof(Packet));
  static constexpr int size = static_cast<int>(sizeof(Packet) / sizeof(Scalar));
};
451
452template <>
453struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
454 using integer_packet = Packet2i;
455};
456template <>
457struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
458 using half = Packet2f;
459 using integer_packet = Packet4i;
460};
461template <>
462struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
463template <>
464struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
465 using half = Packet4c;
466};
467template <>
468struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
469 using half = Packet8c;
470};
471template <>
472struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
473template <>
474struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
475 using half = Packet4uc;
476};
477template <>
478struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
479 using half = Packet8uc;
480};
481template <>
482struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
483template <>
484struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
485 using half = Packet4s;
486};
487template <>
488struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
489template <>
490struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
491 using half = Packet4us;
492};
493template <>
494struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
495template <>
496struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
497 using half = Packet2i;
498};
499template <>
500struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
501template <>
502struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
503 using half = Packet2ui;
504};
505template <>
506struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
507template <>
508struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
509
510template <>
511EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
512 return vdup_n_f32(0.0f);
513}
514
515template <>
516EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
517 return vdupq_n_f32(0.0f);
518}
519
520template <>
521EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
522 return vdup_n_f32(from);
523}
524template <>
525EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
526 return vdupq_n_f32(from);
527}
528template <>
529EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from) {
530 return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
531}
532template <>
533EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) {
534 return vdup_n_s8(from);
535}
536template <>
537EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
538 return vdupq_n_s8(from);
539}
540template <>
541EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from) {
542 return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
543}
544template <>
545EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) {
546 return vdup_n_u8(from);
547}
548template <>
549EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
550 return vdupq_n_u8(from);
551}
552template <>
553EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) {
554 return vdup_n_s16(from);
555}
556template <>
557EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
558 return vdupq_n_s16(from);
559}
560template <>
561EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) {
562 return vdup_n_u16(from);
563}
564template <>
565EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
566 return vdupq_n_u16(from);
567}
568template <>
569EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) {
570 return vdup_n_s32(from);
571}
572template <>
573EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
574 return vdupq_n_s32(from);
575}
576template <>
577EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) {
578 return vdup_n_u32(from);
579}
580template <>
581EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
582 return vdupq_n_u32(from);
583}
584template <>
585EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
586 return vdupq_n_s64(from);
587}
588template <>
589EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
590 return vdupq_n_u64(from);
591}
592
593template <>
594EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
595 return vreinterpret_f32_u32(vdup_n_u32(from));
596}
597template <>
598EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
599 return vreinterpretq_f32_u32(vdupq_n_u32(from));
600}
601
602template <>
603EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a) {
604 const float c[] = {0.0f, 1.0f};
605 return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
606}
607template <>
608EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
609 const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
610 return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
611}
612template <>
613EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a) {
614 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
615}
616template <>
617EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a) {
618 const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
619 return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
620}
621template <>
622EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
623 const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
624 return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
625}
626template <>
627EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a) {
628 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
629}
630template <>
631EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a) {
632 const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
633 return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
634}
635template <>
636EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
637 const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
638 return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
639}
640template <>
641EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a) {
642 const int16_t c[] = {0, 1, 2, 3};
643 return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
644}
645template <>
646EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a) {
647 const uint16_t c[] = {0, 1, 2, 3};
648 return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
649}
650template <>
651EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
652 const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
653 return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
654}
655template <>
656EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
657 const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
658 return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
659}
660template <>
661EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a) {
662 const int32_t c[] = {0, 1};
663 return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
664}
665template <>
666EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
667 const int32_t c[] = {0, 1, 2, 3};
668 return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
669}
670template <>
671EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a) {
672 const uint32_t c[] = {0, 1};
673 return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
674}
675template <>
676EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
677 const uint32_t c[] = {0, 1, 2, 3};
678 return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
679}
680template <>
681EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
682 const int64_t c[] = {0, 1};
683 return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
684}
685template <>
686EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
687 const uint64_t c[] = {0, 1};
688 return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
689}
690
691template <>
692EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) {
693 return vadd_f32(a, b);
694}
695template <>
696EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
697 return vaddq_f32(a, b);
698}
699template <>
700EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b) {
701 return vget_lane_s32(
702 vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
703}
704template <>
705EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) {
706 return vadd_s8(a, b);
707}
708template <>
709EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
710 return vaddq_s8(a, b);
711}
712template <>
713EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
714 return vget_lane_u32(
715 vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
716}
717template <>
718EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
719 return vadd_u8(a, b);
720}
721template <>
722EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
723 return vaddq_u8(a, b);
724}
725template <>
726EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) {
727 return vadd_s16(a, b);
728}
729template <>
730EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
731 return vaddq_s16(a, b);
732}
733template <>
734EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) {
735 return vadd_u16(a, b);
736}
737template <>
738EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
739 return vaddq_u16(a, b);
740}
741template <>
742EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) {
743 return vadd_s32(a, b);
744}
745template <>
746EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
747 return vaddq_s32(a, b);
748}
749template <>
750EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
751 return vadd_u32(a, b);
752}
753template <>
754EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
755 return vaddq_u32(a, b);
756}
757template <>
758EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
759 return vaddq_s64(a, b);
760}
761template <>
762EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
763 return vaddq_u64(a, b);
764}
765
766template <>
767EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) {
768 return vsub_f32(a, b);
769}
770template <>
771EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
772 return vsubq_f32(a, b);
773}
774template <>
775EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b) {
776 return vget_lane_s32(
777 vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
778}
779template <>
780EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) {
781 return vsub_s8(a, b);
782}
783template <>
784EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
785 return vsubq_s8(a, b);
786}
787template <>
788EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
789 return vget_lane_u32(
790 vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
791}
792template <>
793EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
794 return vsub_u8(a, b);
795}
796template <>
797EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
798 return vsubq_u8(a, b);
799}
800template <>
801EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) {
802 return vsub_s16(a, b);
803}
804template <>
805EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
806 return vsubq_s16(a, b);
807}
808template <>
809EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) {
810 return vsub_u16(a, b);
811}
812template <>
813EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
814 return vsubq_u16(a, b);
815}
816template <>
817EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) {
818 return vsub_s32(a, b);
819}
820template <>
821EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
822 return vsubq_s32(a, b);
823}
824template <>
825EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
826 return vsub_u32(a, b);
827}
828template <>
829EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
830 return vsubq_u32(a, b);
831}
832template <>
833EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
834 return vsubq_s64(a, b);
835}
836template <>
837EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
838 return vsubq_u64(a, b);
839}
840
841template <>
842EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
843template <>
844EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b) {
845 Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
846 return padd(a, pxor(mask, b));
847}
848template <>
849EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
850template <>
851EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
852 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
853 return padd(a, pxor(mask, b));
854}
855
856template <>
857EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) {
858 return vneg_f32(a);
859}
860template <>
861EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
862 return vnegq_f32(a);
863}
864template <>
865EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) {
866 return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
867}
868template <>
869EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) {
870 return vneg_s8(a);
871}
872template <>
873EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
874 return vnegq_s8(a);
875}
876template <>
877EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) {
878 return vneg_s16(a);
879}
880template <>
881EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
882 return vnegq_s16(a);
883}
884template <>
885EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) {
886 return vneg_s32(a);
887}
888template <>
889EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
890 return vnegq_s32(a);
891}
892template <>
893EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
894#if EIGEN_ARCH_ARM64
895 return vnegq_s64(a);
896#else
897 return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
898#endif
899}
900
901template <>
902EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) {
903 return a;
904}
905template <>
906EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
907 return a;
908}
909template <>
910EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) {
911 return a;
912}
913template <>
914EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) {
915 return a;
916}
917template <>
918EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
919 return a;
920}
921template <>
922EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) {
923 return a;
924}
925template <>
926EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) {
927 return a;
928}
929template <>
930EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
931 return a;
932}
933template <>
934EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) {
935 return a;
936}
937template <>
938EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
939 return a;
940}
941template <>
942EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) {
943 return a;
944}
945template <>
946EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
947 return a;
948}
949template <>
950EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) {
951 return a;
952}
953template <>
954EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
955 return a;
956}
957template <>
958EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) {
959 return a;
960}
961template <>
962EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
963 return a;
964}
965template <>
966EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
967 return a;
968}
969template <>
970EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
971 return a;
972}
973
974template <>
975EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) {
976 return vmul_f32(a, b);
977}
978template <>
979EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
980 return vmulq_f32(a, b);
981}
982template <>
983EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b) {
984 return vget_lane_s32(
985 vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
986}
987template <>
988EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) {
989 return vmul_s8(a, b);
990}
991template <>
992EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
993 return vmulq_s8(a, b);
994}
995template <>
996EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
997 return vget_lane_u32(
998 vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
999}
1000template <>
1001EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1002 return vmul_u8(a, b);
1003}
1004template <>
1005EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1006 return vmulq_u8(a, b);
1007}
1008template <>
1009EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) {
1010 return vmul_s16(a, b);
1011}
1012template <>
1013EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
1014 return vmulq_s16(a, b);
1015}
1016template <>
1017EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) {
1018 return vmul_u16(a, b);
1019}
1020template <>
1021EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1022 return vmulq_u16(a, b);
1023}
1024template <>
1025EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) {
1026 return vmul_s32(a, b);
1027}
1028template <>
1029EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1030 return vmulq_s32(a, b);
1031}
1032template <>
1033EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1034 return vmul_u32(a, b);
1035}
1036template <>
1037EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1038 return vmulq_u32(a, b);
1039}
1040template <>
1041EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
1042 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
1043 vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
1044}
1045template <>
1046EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1047 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
1048 vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
1049}
1050
1051template <>
1052EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/) {
1053 eigen_assert(false && "packet integer division are not supported by NEON");
1054 return pset1<Packet4c>(0);
1055}
1056template <>
1057EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/) {
1058 eigen_assert(false && "packet integer division are not supported by NEON");
1059 return pset1<Packet8c>(0);
1060}
1061template <>
1062EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/) {
1063 eigen_assert(false && "packet integer division are not supported by NEON");
1064 return pset1<Packet16c>(0);
1065}
1066template <>
1067EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/) {
1068 eigen_assert(false && "packet integer division are not supported by NEON");
1069 return pset1<Packet4uc>(0);
1070}
1071template <>
1072EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/) {
1073 eigen_assert(false && "packet integer division are not supported by NEON");
1074 return pset1<Packet8uc>(0);
1075}
1076template <>
1077EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/) {
1078 eigen_assert(false && "packet integer division are not supported by NEON");
1079 return pset1<Packet16uc>(0);
1080}
1081template <>
1082EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/) {
1083 eigen_assert(false && "packet integer division are not supported by NEON");
1084 return pset1<Packet4s>(0);
1085}
1086template <>
1087EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
1088 eigen_assert(false && "packet integer division are not supported by NEON");
1089 return pset1<Packet8s>(0);
1090}
1091template <>
1092EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/) {
1093 eigen_assert(false && "packet integer division are not supported by NEON");
1094 return pset1<Packet4us>(0);
1095}
1096template <>
1097EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
1098 eigen_assert(false && "packet integer division are not supported by NEON");
1099 return pset1<Packet8us>(0);
1100}
1101template <>
1102EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/) {
1103 eigen_assert(false && "packet integer division are not supported by NEON");
1104 return pset1<Packet2i>(0);
1105}
1106template <>
1107EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
1108 eigen_assert(false && "packet integer division are not supported by NEON");
1109 return pset1<Packet4i>(0);
1110}
1111template <>
1112EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/) {
1113 eigen_assert(false && "packet integer division are not supported by NEON");
1114 return pset1<Packet2ui>(0);
1115}
1116template <>
1117EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
1118 eigen_assert(false && "packet integer division are not supported by NEON");
1119 return pset1<Packet4ui>(0);
1120}
1121template <>
1122EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
1123 eigen_assert(false && "packet integer division are not supported by NEON");
1124 return pset1<Packet2l>(0LL);
1125}
1126template <>
1127EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
1128 eigen_assert(false && "packet integer division are not supported by NEON");
1129 return pset1<Packet2ul>(0ULL);
1130}
1131
1132#ifdef EIGEN_VECTORIZE_FMA
1133template <>
1134EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1135 return vfmaq_f32(c, a, b);
1136}
1137template <>
1138EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1139 return vfma_f32(c, a, b);
1140}
1141template <>
1142EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1143 return vfmsq_f32(c, a, b);
1144}
1145template <>
1146EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1147 return vfms_f32(c, a, b);
1148}
1149#else
1150template <>
1151EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1152 return vmlaq_f32(c, a, b);
1153}
1154template <>
1155EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1156 return vmla_f32(c, a, b);
1157}
1158template <>
1159EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1160 return vmlsq_f32(c, a, b);
1161}
1162template <>
1163EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1164 return vmls_f32(c, a, b);
1165}
1166#endif
1167template <>
1168EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1169 return pnegate(pnmadd(a, b, c));
1170}
1171template <>
1172EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1173 return pnegate(pnmadd(a, b, c));
1174}
1175template <>
1176EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1177 return pnegate(pmadd(a, b, c));
1178}
1179template <>
1180EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1181 return pnegate(pmadd(a, b, c));
1182}
1183
1184// No FMA instruction for int, so use MLA unconditionally.
1185template <>
1186EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) {
1187 return vget_lane_s32(
1188 vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
1189 vreinterpret_s8_s32(vdup_n_s32(b)))),
1190 0);
1191}
1192template <>
1193EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) {
1194 return vmla_s8(c, a, b);
1195}
1196template <>
1197EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
1198 return vmlaq_s8(c, a, b);
1199}
1200template <>
1201EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) {
1202 return vget_lane_u32(
1203 vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
1204 vreinterpret_u8_u32(vdup_n_u32(b)))),
1205 0);
1206}
1207template <>
1208EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) {
1209 return vmla_u8(c, a, b);
1210}
1211template <>
1212EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
1213 return vmlaq_u8(c, a, b);
1214}
1215template <>
1216EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) {
1217 return vmla_s16(c, a, b);
1218}
1219template <>
1220EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1221 return vmlaq_s16(c, a, b);
1222}
1223template <>
1224EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) {
1225 return vmla_u16(c, a, b);
1226}
1227template <>
1228EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1229 return vmlaq_u16(c, a, b);
1230}
1231template <>
1232EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) {
1233 return vmla_s32(c, a, b);
1234}
1235template <>
1236EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1237 return vmlaq_s32(c, a, b);
1238}
1239template <>
1240EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) {
1241 return vmla_u32(c, a, b);
1242}
1243template <>
1244EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
1245 return vmlaq_u32(c, a, b);
1246}
1247
1248template <>
1249EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b) {
1250 return vabd_f32(a, b);
1251}
1252template <>
1253EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
1254 return vabdq_f32(a, b);
1255}
1256template <>
1257EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b) {
1258 return vget_lane_s32(
1259 vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1260}
1261template <>
1262EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b) {
1263 return vabd_s8(a, b);
1264}
1265template <>
1266EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
1267 return vabdq_s8(a, b);
1268}
1269template <>
1270EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1271 return vget_lane_u32(
1272 vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1273}
1274template <>
1275EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1276 return vabd_u8(a, b);
1277}
1278template <>
1279EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1280 return vabdq_u8(a, b);
1281}
1282template <>
1283EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b) {
1284 return vabd_s16(a, b);
1285}
1286template <>
1287EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
1288 return vabdq_s16(a, b);
1289}
1290template <>
1291EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b) {
1292 return vabd_u16(a, b);
1293}
1294template <>
1295EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
1296 return vabdq_u16(a, b);
1297}
1298template <>
1299EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b) {
1300 return vabd_s32(a, b);
1301}
1302template <>
1303EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
1304 return vabdq_s32(a, b);
1305}
1306template <>
1307EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1308 return vabd_u32(a, b);
1309}
1310template <>
1311EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1312 return vabdq_u32(a, b);
1313}
1314
1315template <>
1316EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) {
1317 return vmin_f32(a, b);
1318}
1319template <>
1320EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1321 return vminq_f32(a, b);
1322}
1323
1324#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1325// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
1326// systems).
1327template <>
1328EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1329 return vminnmq_f32(a, b);
1330}
1331template <>
1332EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
1333 return vminnm_f32(a, b);
1334}
1335#endif
1336
1337template <>
1338EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1339 return pmin<Packet4f>(a, b);
1340}
1341
1342template <>
1343EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
1344 return pmin<Packet2f>(a, b);
1345}
1346
1347template <>
1348EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b) {
1349 return vget_lane_s32(
1350 vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1351}
1352template <>
1353EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) {
1354 return vmin_s8(a, b);
1355}
1356template <>
1357EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1358 return vminq_s8(a, b);
1359}
1360template <>
1361EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1362 return vget_lane_u32(
1363 vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1364}
1365template <>
1366EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1367 return vmin_u8(a, b);
1368}
1369template <>
1370EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1371 return vminq_u8(a, b);
1372}
1373template <>
1374EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) {
1375 return vmin_s16(a, b);
1376}
1377template <>
1378EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1379 return vminq_s16(a, b);
1380}
1381template <>
1382EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) {
1383 return vmin_u16(a, b);
1384}
1385template <>
1386EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1387 return vminq_u16(a, b);
1388}
1389template <>
1390EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) {
1391 return vmin_s32(a, b);
1392}
1393template <>
1394EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1395 return vminq_s32(a, b);
1396}
1397template <>
1398EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1399 return vmin_u32(a, b);
1400}
1401template <>
1402EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1403 return vminq_u32(a, b);
1404}
1405template <>
1406EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1407 return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1408 vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1409}
1410template <>
1411EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1412 return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1413 vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1414}
1415
1416template <>
1417EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) {
1418 return vmax_f32(a, b);
1419}
1420template <>
1421EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1422 return vmaxq_f32(a, b);
1423}
1424
1425#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1426// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
1427// systems).
1428template <>
1429EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1430 return vmaxnmq_f32(a, b);
1431}
1432template <>
1433EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
1434 return vmaxnm_f32(a, b);
1435}
1436#endif
1437
1438template <>
1439EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1440 return pmax<Packet4f>(a, b);
1441}
1442
1443template <>
1444EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
1445 return pmax<Packet2f>(a, b);
1446}
1447
1448template <>
1449EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b) {
1450 return vget_lane_s32(
1451 vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1452}
1453template <>
1454EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) {
1455 return vmax_s8(a, b);
1456}
1457template <>
1458EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1459 return vmaxq_s8(a, b);
1460}
1461template <>
1462EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1463 return vget_lane_u32(
1464 vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1465}
1466template <>
1467EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1468 return vmax_u8(a, b);
1469}
1470template <>
1471EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1472 return vmaxq_u8(a, b);
1473}
1474template <>
1475EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) {
1476 return vmax_s16(a, b);
1477}
1478template <>
1479EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1480 return vmaxq_s16(a, b);
1481}
1482template <>
1483EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) {
1484 return vmax_u16(a, b);
1485}
1486template <>
1487EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1488 return vmaxq_u16(a, b);
1489}
1490template <>
1491EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) {
1492 return vmax_s32(a, b);
1493}
1494template <>
1495EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1496 return vmaxq_s32(a, b);
1497}
1498template <>
1499EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1500 return vmax_u32(a, b);
1501}
1502template <>
1503EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1504 return vmaxq_u32(a, b);
1505}
1506template <>
1507EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1508 return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1509 vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1510}
1511template <>
1512EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1513 return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1514 vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1515}
1516
1517template <>
1518EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b) {
1519 return vreinterpret_f32_u32(vcle_f32(a, b));
1520}
1521template <>
1522EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1523 return vreinterpretq_f32_u32(vcleq_f32(a, b));
1524}
1525template <>
1526EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b) {
1527 return vget_lane_s32(
1528 vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1529}
1530template <>
1531EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b) {
1532 return vreinterpret_s8_u8(vcle_s8(a, b));
1533}
1534template <>
1535EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
1536 return vreinterpretq_s8_u8(vcleq_s8(a, b));
1537}
1538template <>
1539EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1540 return vget_lane_u32(
1541 vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1542}
1543template <>
1544EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1545 return vcle_u8(a, b);
1546}
1547template <>
1548EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1549 return vcleq_u8(a, b);
1550}
1551template <>
1552EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b) {
1553 return vreinterpret_s16_u16(vcle_s16(a, b));
1554}
1555template <>
1556EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
1557 return vreinterpretq_s16_u16(vcleq_s16(a, b));
1558}
1559template <>
1560EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b) {
1561 return vcle_u16(a, b);
1562}
1563template <>
1564EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
1565 return vcleq_u16(a, b);
1566}
1567template <>
1568EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b) {
1569 return vreinterpret_s32_u32(vcle_s32(a, b));
1570}
1571template <>
1572EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
1573 return vreinterpretq_s32_u32(vcleq_s32(a, b));
1574}
1575template <>
1576EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1577 return vcle_u32(a, b);
1578}
1579template <>
1580EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1581 return vcleq_u32(a, b);
1582}
1583template <>
1584EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
1585#if EIGEN_ARCH_ARM64
1586 return vreinterpretq_s64_u64(vcleq_s64(a, b));
1587#else
1588 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1589 vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1590#endif
1591}
1592template <>
1593EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1594#if EIGEN_ARCH_ARM64
1595 return vcleq_u64(a, b);
1596#else
1597 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1598 vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1599#endif
1600}
1601
1602template <>
1603EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b) {
1604 return vreinterpret_f32_u32(vclt_f32(a, b));
1605}
1606template <>
1607EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1608 return vreinterpretq_f32_u32(vcltq_f32(a, b));
1609}
1610template <>
1611EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b) {
1612 return vget_lane_s32(
1613 vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1614}
1615template <>
1616EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b) {
1617 return vreinterpret_s8_u8(vclt_s8(a, b));
1618}
1619template <>
1620EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
1621 return vreinterpretq_s8_u8(vcltq_s8(a, b));
1622}
1623template <>
1624EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1625 return vget_lane_u32(
1626 vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1627}
1628template <>
1629EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1630 return vclt_u8(a, b);
1631}
1632template <>
1633EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1634 return vcltq_u8(a, b);
1635}
1636template <>
1637EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b) {
1638 return vreinterpret_s16_u16(vclt_s16(a, b));
1639}
1640template <>
1641EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
1642 return vreinterpretq_s16_u16(vcltq_s16(a, b));
1643}
1644template <>
1645EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b) {
1646 return vclt_u16(a, b);
1647}
1648template <>
1649EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
1650 return vcltq_u16(a, b);
1651}
1652template <>
1653EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b) {
1654 return vreinterpret_s32_u32(vclt_s32(a, b));
1655}
1656template <>
1657EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
1658 return vreinterpretq_s32_u32(vcltq_s32(a, b));
1659}
1660template <>
1661EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1662 return vclt_u32(a, b);
1663}
1664template <>
1665EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1666 return vcltq_u32(a, b);
1667}
1668template <>
1669EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
1670#if EIGEN_ARCH_ARM64
1671 return vreinterpretq_s64_u64(vcltq_s64(a, b));
1672#else
1673 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1674 vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1675#endif
1676}
1677template <>
1678EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1679#if EIGEN_ARCH_ARM64
1680 return vcltq_u64(a, b);
1681#else
1682 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1683 vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1684#endif
1685}
1686
1687template <>
1688EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b) {
1689 return vreinterpret_f32_u32(vceq_f32(a, b));
1690}
1691template <>
1692EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1693 return vreinterpretq_f32_u32(vceqq_f32(a, b));
1694}
1695template <>
1696EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b) {
1697 return vget_lane_s32(
1698 vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1699}
1700template <>
1701EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b) {
1702 return vreinterpret_s8_u8(vceq_s8(a, b));
1703}
1704template <>
1705EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
1706 return vreinterpretq_s8_u8(vceqq_s8(a, b));
1707}
1708template <>
1709EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1710 return vget_lane_u32(
1711 vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1712}
1713template <>
1714EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1715 return vceq_u8(a, b);
1716}
1717template <>
1718EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1719 return vceqq_u8(a, b);
1720}
1721template <>
1722EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b) {
1723 return vreinterpret_s16_u16(vceq_s16(a, b));
1724}
1725template <>
1726EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
1727 return vreinterpretq_s16_u16(vceqq_s16(a, b));
1728}
1729template <>
1730EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b) {
1731 return vceq_u16(a, b);
1732}
1733template <>
1734EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
1735 return vceqq_u16(a, b);
1736}
1737template <>
1738EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b) {
1739 return vreinterpret_s32_u32(vceq_s32(a, b));
1740}
1741template <>
1742EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
1743 return vreinterpretq_s32_u32(vceqq_s32(a, b));
1744}
1745template <>
1746EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1747 return vceq_u32(a, b);
1748}
1749template <>
1750EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1751 return vceqq_u32(a, b);
1752}
1753template <>
1754EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
1755#if EIGEN_ARCH_ARM64
1756 return vreinterpretq_s64_u64(vceqq_s64(a, b));
1757#else
1758 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1759 vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1760#endif
1761}
1762template <>
1763EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1764#if EIGEN_ARCH_ARM64
1765 return vceqq_u64(a, b);
1766#else
1767 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1768 vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1769#endif
1770}
1771
1772template <>
1773EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b) {
1774 return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
1775}
1776template <>
1777EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
1778 return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
1779}
1780
1781// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
1782template <>
1783EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b) {
1784 return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1785}
1786template <>
1787EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1788 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1789}
1790template <>
1791EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b) {
1792 return a & b;
1793}
1794template <>
1795EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b) {
1796 return vand_s8(a, b);
1797}
1798template <>
1799EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
1800 return vandq_s8(a, b);
1801}
1802template <>
1803EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1804 return a & b;
1805}
1806template <>
1807EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1808 return vand_u8(a, b);
1809}
1810template <>
1811EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1812 return vandq_u8(a, b);
1813}
1814template <>
1815EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) {
1816 return vand_s16(a, b);
1817}
1818template <>
1819EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
1820 return vandq_s16(a, b);
1821}
1822template <>
1823EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b) {
1824 return vand_u16(a, b);
1825}
1826template <>
1827EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1828 return vandq_u16(a, b);
1829}
1830template <>
1831EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) {
1832 return vand_s32(a, b);
1833}
1834template <>
1835EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1836 return vandq_s32(a, b);
1837}
1838template <>
1839EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1840 return vand_u32(a, b);
1841}
1842template <>
1843EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1844 return vandq_u32(a, b);
1845}
1846template <>
1847EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
1848 return vandq_s64(a, b);
1849}
1850template <>
1851EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1852 return vandq_u64(a, b);
1853}
1854
1855template <>
1856EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b) {
1857 return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1858}
1859template <>
1860EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1861 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1862}
1863template <>
1864EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b) {
1865 return a | b;
1866}
1867template <>
1868EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) {
1869 return vorr_s8(a, b);
1870}
1871template <>
1872EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
1873 return vorrq_s8(a, b);
1874}
1875template <>
1876EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1877 return a | b;
1878}
1879template <>
1880EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1881 return vorr_u8(a, b);
1882}
1883template <>
1884EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1885 return vorrq_u8(a, b);
1886}
1887template <>
1888EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b) {
1889 return vorr_s16(a, b);
1890}
1891template <>
1892EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1893 return vorrq_s16(a, b);
1894}
1895template <>
1896EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b) {
1897 return vorr_u16(a, b);
1898}
1899template <>
1900EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1901 return vorrq_u16(a, b);
1902}
1903template <>
1904EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) {
1905 return vorr_s32(a, b);
1906}
1907template <>
1908EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1909 return vorrq_s32(a, b);
1910}
1911template <>
1912EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1913 return vorr_u32(a, b);
1914}
1915template <>
1916EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1917 return vorrq_u32(a, b);
1918}
1919template <>
1920EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
1921 return vorrq_s64(a, b);
1922}
1923template <>
1924EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1925 return vorrq_u64(a, b);
1926}
1927
1928template <>
1929EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) {
1930 return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1931}
1932template <>
1933EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1934 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1935}
1936template <>
1937EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) {
1938 return a ^ b;
1939}
1940template <>
1941EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) {
1942 return veor_s8(a, b);
1943}
1944template <>
1945EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
1946 return veorq_s8(a, b);
1947}
1948template <>
1949EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1950 return a ^ b;
1951}
1952template <>
1953EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1954 return veor_u8(a, b);
1955}
1956template <>
1957EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1958 return veorq_u8(a, b);
1959}
1960template <>
1961EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) {
1962 return veor_s16(a, b);
1963}
1964template <>
1965EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
1966 return veorq_s16(a, b);
1967}
1968template <>
1969EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) {
1970 return veor_u16(a, b);
1971}
1972template <>
1973EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1974 return veorq_u16(a, b);
1975}
1976template <>
1977EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) {
1978 return veor_s32(a, b);
1979}
1980template <>
1981EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1982 return veorq_s32(a, b);
1983}
1984template <>
1985EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1986 return veor_u32(a, b);
1987}
1988template <>
1989EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1990 return veorq_u32(a, b);
1991}
1992template <>
1993EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
1994 return veorq_s64(a, b);
1995}
1996template <>
1997EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1998 return veorq_u64(a, b);
1999}
2000
2001template <>
2002EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) {
2003 return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
2004}
2005template <>
2006EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
2007 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
2008}
2009template <>
2010EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) {
2011 return a & ~b;
2012}
2013template <>
2014EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) {
2015 return vbic_s8(a, b);
2016}
2017template <>
2018EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
2019 return vbicq_s8(a, b);
2020}
2021template <>
2022EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
2023 return a & ~b;
2024}
2025template <>
2026EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
2027 return vbic_u8(a, b);
2028}
2029template <>
2030EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2031 return vbicq_u8(a, b);
2032}
2033template <>
2034EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) {
2035 return vbic_s16(a, b);
2036}
2037template <>
2038EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
2039 return vbicq_s16(a, b);
2040}
2041template <>
2042EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) {
2043 return vbic_u16(a, b);
2044}
2045template <>
2046EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
2047 return vbicq_u16(a, b);
2048}
2049template <>
2050EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) {
2051 return vbic_s32(a, b);
2052}
2053template <>
2054EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
2055 return vbicq_s32(a, b);
2056}
2057template <>
2058EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
2059 return vbic_u32(a, b);
2060}
2061template <>
2062EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
2063 return vbicq_u32(a, b);
2064}
2065template <>
2066EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
2067 return vbicq_s64(a, b);
2068}
2069template <>
2070EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
2071 return vbicq_u64(a, b);
2072}
2073
2074template <int N>
2075EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
2076 return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2077}
2078template <int N>
2079EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
2080 return vshr_n_s8(a, N);
2081}
2082template <int N>
2083EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
2084 return vshrq_n_s8(a, N);
2085}
2086template <int N>
2087EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
2088 return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2089}
2090template <int N>
2091EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
2092 return vshr_n_u8(a, N);
2093}
2094template <int N>
2095EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
2096 return vshrq_n_u8(a, N);
2097}
2098template <int N>
2099EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
2100 return vshr_n_s16(a, N);
2101}
2102template <int N>
2103EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
2104 return vshrq_n_s16(a, N);
2105}
2106template <int N>
2107EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
2108 return vshr_n_u16(a, N);
2109}
2110template <int N>
2111EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
2112 return vshrq_n_u16(a, N);
2113}
2114template <int N>
2115EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
2116 return vshr_n_s32(a, N);
2117}
2118template <int N>
2119EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
2120 return vshrq_n_s32(a, N);
2121}
2122template <int N>
2123EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
2124 return vshr_n_u32(a, N);
2125}
2126template <int N>
2127EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
2128 return vshrq_n_u32(a, N);
2129}
2130template <int N>
2131EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
2132 return vshrq_n_s64(a, N);
2133}
2134template <int N>
2135EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
2136 return vshrq_n_u64(a, N);
2137}
2138
2139template <int N>
2140EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
2141 return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
2142}
2143template <int N>
2144EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
2145 return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
2146}
2147template <int N>
2148EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
2149 return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
2150}
2151template <int N>
2152EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
2153 return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
2154}
2155template <int N>
2156EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
2157 return vshr_n_u8(a, N);
2158}
2159template <int N>
2160EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
2161 return vshrq_n_u8(a, N);
2162}
2163template <int N>
2164EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
2165 return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
2166}
2167template <int N>
2168EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
2169 return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
2170}
2171template <int N>
2172EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
2173 return vshr_n_u16(a, N);
2174}
2175template <int N>
2176EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
2177 return vshrq_n_u16(a, N);
2178}
2179template <int N>
2180EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
2181 return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
2182}
2183template <int N>
2184EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
2185 return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
2186}
2187template <int N>
2188EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
2189 return vshr_n_u32(a, N);
2190}
2191template <int N>
2192EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
2193 return vshrq_n_u32(a, N);
2194}
2195template <int N>
2196EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
2197 return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
2198}
2199template <int N>
2200EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
2201 return vshrq_n_u64(a, N);
2202}
2203
2204template <int N>
2205EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
2206 return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2207}
2208template <int N>
2209EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
2210 return vshl_n_s8(a, N);
2211}
2212template <int N>
2213EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
2214 return vshlq_n_s8(a, N);
2215}
2216template <int N>
2217EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
2218 return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2219}
2220template <int N>
2221EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
2222 return vshl_n_u8(a, N);
2223}
2224template <int N>
2225EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
2226 return vshlq_n_u8(a, N);
2227}
2228template <int N>
2229EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
2230 return vshl_n_s16(a, N);
2231}
2232template <int N>
2233EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
2234 return vshlq_n_s16(a, N);
2235}
2236template <int N>
2237EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
2238 return vshl_n_u16(a, N);
2239}
2240template <int N>
2241EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
2242 return vshlq_n_u16(a, N);
2243}
2244template <int N>
2245EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
2246 return vshl_n_s32(a, N);
2247}
2248template <int N>
2249EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
2250 return vshlq_n_s32(a, N);
2251}
2252template <int N>
2253EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
2254 return vshl_n_u32(a, N);
2255}
2256template <int N>
2257EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
2258 return vshlq_n_u32(a, N);
2259}
2260template <int N>
2261EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
2262 return vshlq_n_s64(a, N);
2263}
2264template <int N>
2265EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
2266 return vshlq_n_u64(a, N);
2267}
2268
// pload<Packet>(from): aligned load of one packet starting at `from`.
// Each vector overload first passes `from` and the packet's declared alignment to
// EIGEN_ASSUME_ALIGNED (macro defined elsewhere; presumably an alignment hint -- confirm),
// then loads with the vld1 intrinsic matching the lane type and vector width.
// EIGEN_DEBUG_ALIGNED_LOAD is a hook macro defined elsewhere; its expansion is not visible here.
2269template <>
2270EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
2271 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2f>::alignment);
2272 EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
2273}
2274template <>
2275EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
2276 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4f>::alignment);
2277 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
2278}
// Packet4c is not loaded with a NEON intrinsic: sizeof(Packet4c) bytes are copied with
// memcpy, and neither the alignment macro nor the debug hook is applied.
2279template <>
2280EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
2281 Packet4c res;
2282 memcpy(&res, from, sizeof(Packet4c));
2283 return res;
2284}
2285template <>
2286EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
2287 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8c>::alignment);
2288 EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
2289}
2290template <>
2291EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
2292 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet16c>::alignment);
2293 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
2294}
// Packet4uc: same memcpy-based load as Packet4c, for unsigned bytes.
2295template <>
2296EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
2297 Packet4uc res;
2298 memcpy(&res, from, sizeof(Packet4uc));
2299 return res;
2300}
2301template <>
2302EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
2303 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8uc>::alignment);
2304 EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
2305}
2306template <>
2307EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
2308 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet16uc>::alignment);
2309 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
2310}
// 16-bit lane packets.
2311template <>
2312EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
2313 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4s>::alignment);
2314 EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
2315}
2316template <>
2317EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
2318 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8s>::alignment);
2319 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
2320}
2321template <>
2322EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
2323 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4us>::alignment);
2324 EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
2325}
2326template <>
2327EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
2328 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8us>::alignment);
2329 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
2330}
// 32-bit lane packets.
2331template <>
2332EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
2333 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2i>::alignment);
2334 EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
2335}
2336template <>
2337EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
2338 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4i>::alignment);
2339 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
2340}
2341template <>
2342EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
2343 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2ui>::alignment);
2344 EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
2345}
2346template <>
2347EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
2348 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4ui>::alignment);
2349 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
2350}
// 64-bit lane packets.
2351template <>
2352EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
2353 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2l>::alignment);
2354 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
2355}
2356template <>
2357EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
2358 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2ul>::alignment);
2359 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
2360}
2361
// ploadu<Packet>(from): load of one packet starting at `from` without the alignment
// macro used by pload. The vector overloads call the same vld1 intrinsics as pload.
// EIGEN_DEBUG_UNALIGNED_LOAD is a hook macro defined elsewhere; its expansion is not visible here.
2362template <>
2363EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from) {
2364 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
2365}
2366template <>
2367EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
2368 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
2369}
// Packet4c: sizeof(Packet4c) bytes are copied with memcpy; no debug hook is applied.
2370template <>
2371EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from) {
2372 Packet4c res;
2373 memcpy(&res, from, sizeof(Packet4c));
2374 return res;
2375}
2376template <>
2377EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from) {
2378 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
2379}
2380template <>
2381EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
2382 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
2383}
// Packet4uc: same memcpy-based load as Packet4c, for unsigned bytes.
2384template <>
2385EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from) {
2386 Packet4uc res;
2387 memcpy(&res, from, sizeof(Packet4uc));
2388 return res;
2389}
2390template <>
2391EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from) {
2392 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
2393}
2394template <>
2395EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
2396 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
2397}
// 16-bit lane packets.
2398template <>
2399EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from) {
2400 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
2401}
2402template <>
2403EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
2404 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
2405}
2406template <>
2407EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from) {
2408 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
2409}
2410template <>
2411EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
2412 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
2413}
// 32-bit lane packets.
2414template <>
2415EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from) {
2416 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
2417}
2418template <>
2419EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
2420 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
2421}
2422template <>
2423EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from) {
2424 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
2425}
2426template <>
2427EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
2428 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
2429}
// 64-bit lane packets.
2430template <>
2431EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
2432 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
2433}
2434template <>
2435EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
2436 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
2437}
2438
2439template <>
2440EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from) {
2441 return vld1_dup_f32(from);
2442}
2443template <>
2444EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
2445 return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
2446}
2447template <>
2448EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from) {
2449 const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
2450 return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
2451}
2452template <>
2453EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from) {
2454 const int8x8_t a = vld1_s8(from);
2455 return vzip_s8(a, a).val[0];
2456}
2457template <>
2458EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
2459 const int8x8_t a = vld1_s8(from);
2460 const int8x8x2_t b = vzip_s8(a, a);
2461 return vcombine_s8(b.val[0], b.val[1]);
2462}
2463template <>
2464EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from) {
2465 const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
2466 return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
2467}
2468template <>
2469EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from) {
2470 const uint8x8_t a = vld1_u8(from);
2471 return vzip_u8(a, a).val[0];
2472}
2473template <>
2474EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
2475 const uint8x8_t a = vld1_u8(from);
2476 const uint8x8x2_t b = vzip_u8(a, a);
2477 return vcombine_u8(b.val[0], b.val[1]);
2478}
2479template <>
2480EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from) {
2481 return vreinterpret_s16_u32(
2482 vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
2483}
2484template <>
2485EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
2486 const int16x4_t a = vld1_s16(from);
2487 const int16x4x2_t b = vzip_s16(a, a);
2488 return vcombine_s16(b.val[0], b.val[1]);
2489}
2490template <>
2491EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from) {
2492 return vreinterpret_u16_u32(
2493 vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
2494}
2495template <>
2496EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
2497 const uint16x4_t a = vld1_u16(from);
2498 const uint16x4x2_t b = vzip_u16(a, a);
2499 return vcombine_u16(b.val[0], b.val[1]);
2500}
2501template <>
2502EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from) {
2503 return vld1_dup_s32(from);
2504}
2505template <>
2506EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
2507 return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
2508}
2509template <>
2510EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from) {
2511 return vld1_dup_u32(from);
2512}
2513template <>
2514EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
2515 return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
2516}
2517template <>
2518EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
2519 return vld1q_dup_s64(from);
2520}
2521template <>
2522EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
2523 return vld1q_dup_u64(from);
2524}
2525
2526template <>
2527EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
2528 return vld1q_dup_f32(from);
2529}
2530template <>
2531EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
2532 return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
2533}
2534template <>
2535EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
2536 return vreinterpret_s8_u32(
2537 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2538}
2539template <>
2540EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
2541 const int8x8_t a = vreinterpret_s8_u32(
2542 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2543 const int8x8_t b = vreinterpret_s8_u32(
2544 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
2545 return vcombine_s8(a, b);
2546}
2547template <>
2548EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
2549 return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
2550}
2551template <>
2552EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
2553 return vreinterpret_u8_u32(
2554 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2555}
2556template <>
2557EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
2558 const uint8x8_t a = vreinterpret_u8_u32(
2559 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2560 const uint8x8_t b = vreinterpret_u8_u32(
2561 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
2562 return vcombine_u8(a, b);
2563}
2564template <>
2565EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
2566 return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
2567}
2568template <>
2569EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
2570 return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
2571}
2572template <>
2573EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
2574 return vld1q_dup_s32(from);
2575}
2576template <>
2577EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
2578 return vld1q_dup_u32(from);
2579}
2580
// pstore(to, from): aligned store of the packet `from` to memory starting at `to`.
// Each vector overload first passes `to` and the packet's declared alignment to
// EIGEN_ASSUME_ALIGNED (macro defined elsewhere; presumably an alignment hint -- confirm),
// then stores with the vst1 intrinsic matching the lane type and vector width.
// EIGEN_DEBUG_ALIGNED_STORE is a hook macro defined elsewhere; its expansion is not visible here.
2581template <>
2582EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
2583 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2f>::alignment);
2584 EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
2585}
2586template <>
2587EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
2588 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4f>::alignment);
2589 EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
2590}
// Packet4c is stored by copying sizeof(from) bytes with memcpy; neither the alignment
// macro nor the debug hook is applied.
2591template <>
2592EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
2593 memcpy(to, &from, sizeof(from));
2594}
2595template <>
2596EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
2597 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8c>::alignment);
2598 EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
2599}
2600template <>
2601EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
2602 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet16c>::alignment);
2603 EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
2604}
// Packet4uc: same memcpy-based store as Packet4c, for unsigned bytes.
2605template <>
2606EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
2607 memcpy(to, &from, sizeof(from));
2608}
2609template <>
2610EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
2611 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8uc>::alignment);
2612 EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
2613}
2614template <>
2615EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
2616 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet16uc>::alignment);
2617 EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
2618}
// 16-bit lane packets.
2619template <>
2620EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
2621 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4s>::alignment);
2622 EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
2623}
2624template <>
2625EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
2626 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8s>::alignment);
2627 EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
2628}
2629template <>
2630EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
2631 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4us>::alignment);
2632 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
2633}
2634template <>
2635EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
2636 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8us>::alignment);
2637 EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
2638}
// 32-bit lane packets.
2639template <>
2640EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
2641 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2i>::alignment);
2642 EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
2643}
2644template <>
2645EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
2646 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4i>::alignment);
2647 EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
2648}
2649template <>
2650EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
2651 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2ui>::alignment);
2652 EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
2653}
2654template <>
2655EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
2656 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4ui>::alignment);
2657 EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
2658}
// 64-bit lane packets.
2659template <>
2660EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
2661 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2l>::alignment);
2662 EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
2663}
2664template <>
2665EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
2666 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2ul>::alignment);
2667 EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
2668}
2669
2670template <>
2671EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
2672 EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
2673}
2674template <>
2675EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
2676 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
2677}
2678template <>
2679EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) {
2680 memcpy(to, &from, sizeof(from));
2681}
2682template <>
2683EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) {
2684 EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
2685}
2686template <>
2687EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
2688 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
2689}
2690template <>
2691EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) {
2692 memcpy(to, &from, sizeof(from));
2693}
2694template <>
2695EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) {
2696 EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
2697}
2698template <>
2699EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
2700 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
2701}
2702template <>
2703EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) {
2704 EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
2705}
2706template <>
2707EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
2708 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
2709}
2710template <>
2711EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) {
2712 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
2713}
2714template <>
2715EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
2716 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
2717}
2718template <>
2719EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) {
2720 EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
2721}
2722template <>
2723EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
2724 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
2725}
2726template <>
2727EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) {
2728 EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
2729}
2730template <>
2731EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
2732 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
2733}
2734template <>
2735EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
2736 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
2737}
2738template <>
2739EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
2740 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
2741}
2742
2743template <>
2744EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride) {
2745 Packet2f res = vld1_dup_f32(from);
2746 res = vld1_lane_f32(from + 1 * stride, res, 1);
2747 return res;
2748}
2749template <>
2750EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
2751 Packet4f res = vld1q_dup_f32(from);
2752 res = vld1q_lane_f32(from + 1 * stride, res, 1);
2753 res = vld1q_lane_f32(from + 2 * stride, res, 2);
2754 res = vld1q_lane_f32(from + 3 * stride, res, 3);
2755 return res;
2756}
2757template <>
2758EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride) {
2759 Packet4c res;
2760 for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
2761 return res;
2762}
2763template <>
2764EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride) {
2765 Packet8c res = vld1_dup_s8(from);
2766 res = vld1_lane_s8(from + 1 * stride, res, 1);
2767 res = vld1_lane_s8(from + 2 * stride, res, 2);
2768 res = vld1_lane_s8(from + 3 * stride, res, 3);
2769 res = vld1_lane_s8(from + 4 * stride, res, 4);
2770 res = vld1_lane_s8(from + 5 * stride, res, 5);
2771 res = vld1_lane_s8(from + 6 * stride, res, 6);
2772 res = vld1_lane_s8(from + 7 * stride, res, 7);
2773 return res;
2774}
2775template <>
2776EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
2777 Packet16c res = vld1q_dup_s8(from);
2778 res = vld1q_lane_s8(from + 1 * stride, res, 1);
2779 res = vld1q_lane_s8(from + 2 * stride, res, 2);
2780 res = vld1q_lane_s8(from + 3 * stride, res, 3);
2781 res = vld1q_lane_s8(from + 4 * stride, res, 4);
2782 res = vld1q_lane_s8(from + 5 * stride, res, 5);
2783 res = vld1q_lane_s8(from + 6 * stride, res, 6);
2784 res = vld1q_lane_s8(from + 7 * stride, res, 7);
2785 res = vld1q_lane_s8(from + 8 * stride, res, 8);
2786 res = vld1q_lane_s8(from + 9 * stride, res, 9);
2787 res = vld1q_lane_s8(from + 10 * stride, res, 10);
2788 res = vld1q_lane_s8(from + 11 * stride, res, 11);
2789 res = vld1q_lane_s8(from + 12 * stride, res, 12);
2790 res = vld1q_lane_s8(from + 13 * stride, res, 13);
2791 res = vld1q_lane_s8(from + 14 * stride, res, 14);
2792 res = vld1q_lane_s8(from + 15 * stride, res, 15);
2793 return res;
2794}
2795template <>
2796EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride) {
2797 Packet4uc res;
2798 for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
2799 return res;
2800}
2801template <>
2802EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride) {
2803 Packet8uc res = vld1_dup_u8(from);
2804 res = vld1_lane_u8(from + 1 * stride, res, 1);
2805 res = vld1_lane_u8(from + 2 * stride, res, 2);
2806 res = vld1_lane_u8(from + 3 * stride, res, 3);
2807 res = vld1_lane_u8(from + 4 * stride, res, 4);
2808 res = vld1_lane_u8(from + 5 * stride, res, 5);
2809 res = vld1_lane_u8(from + 6 * stride, res, 6);
2810 res = vld1_lane_u8(from + 7 * stride, res, 7);
2811 return res;
2812}
2813template <>
2814EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
2815 Packet16uc res = vld1q_dup_u8(from);
2816 res = vld1q_lane_u8(from + 1 * stride, res, 1);
2817 res = vld1q_lane_u8(from + 2 * stride, res, 2);
2818 res = vld1q_lane_u8(from + 3 * stride, res, 3);
2819 res = vld1q_lane_u8(from + 4 * stride, res, 4);
2820 res = vld1q_lane_u8(from + 5 * stride, res, 5);
2821 res = vld1q_lane_u8(from + 6 * stride, res, 6);
2822 res = vld1q_lane_u8(from + 7 * stride, res, 7);
2823 res = vld1q_lane_u8(from + 8 * stride, res, 8);
2824 res = vld1q_lane_u8(from + 9 * stride, res, 9);
2825 res = vld1q_lane_u8(from + 10 * stride, res, 10);
2826 res = vld1q_lane_u8(from + 11 * stride, res, 11);
2827 res = vld1q_lane_u8(from + 12 * stride, res, 12);
2828 res = vld1q_lane_u8(from + 13 * stride, res, 13);
2829 res = vld1q_lane_u8(from + 14 * stride, res, 14);
2830 res = vld1q_lane_u8(from + 15 * stride, res, 15);
2831 return res;
2832}
2833template <>
2834EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride) {
2835 Packet4s res = vld1_dup_s16(from);
2836 res = vld1_lane_s16(from + 1 * stride, res, 1);
2837 res = vld1_lane_s16(from + 2 * stride, res, 2);
2838 res = vld1_lane_s16(from + 3 * stride, res, 3);
2839 return res;
2840}
2841template <>
2842EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
2843 Packet8s res = vld1q_dup_s16(from);
2844 res = vld1q_lane_s16(from + 1 * stride, res, 1);
2845 res = vld1q_lane_s16(from + 2 * stride, res, 2);
2846 res = vld1q_lane_s16(from + 3 * stride, res, 3);
2847 res = vld1q_lane_s16(from + 4 * stride, res, 4);
2848 res = vld1q_lane_s16(from + 5 * stride, res, 5);
2849 res = vld1q_lane_s16(from + 6 * stride, res, 6);
2850 res = vld1q_lane_s16(from + 7 * stride, res, 7);
2851 return res;
2852}
2853template <>
2854EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride) {
2855 Packet4us res = vld1_dup_u16(from);
2856 res = vld1_lane_u16(from + 1 * stride, res, 1);
2857 res = vld1_lane_u16(from + 2 * stride, res, 2);
2858 res = vld1_lane_u16(from + 3 * stride, res, 3);
2859 return res;
2860}
2861template <>
2862EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
2863 Packet8us res = vld1q_dup_u16(from);
2864 res = vld1q_lane_u16(from + 1 * stride, res, 1);
2865 res = vld1q_lane_u16(from + 2 * stride, res, 2);
2866 res = vld1q_lane_u16(from + 3 * stride, res, 3);
2867 res = vld1q_lane_u16(from + 4 * stride, res, 4);
2868 res = vld1q_lane_u16(from + 5 * stride, res, 5);
2869 res = vld1q_lane_u16(from + 6 * stride, res, 6);
2870 res = vld1q_lane_u16(from + 7 * stride, res, 7);
2871 return res;
2872}
2873template <>
2874EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride) {
2875 Packet2i res = vld1_dup_s32(from);
2876 res = vld1_lane_s32(from + 1 * stride, res, 1);
2877 return res;
2878}
2879template <>
2880EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
2881 Packet4i res = vld1q_dup_s32(from);
2882 res = vld1q_lane_s32(from + 1 * stride, res, 1);
2883 res = vld1q_lane_s32(from + 2 * stride, res, 2);
2884 res = vld1q_lane_s32(from + 3 * stride, res, 3);
2885 return res;
2886}
2887template <>
2888EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride) {
2889 Packet2ui res = vld1_dup_u32(from);
2890 res = vld1_lane_u32(from + 1 * stride, res, 1);
2891 return res;
2892}
2893template <>
2894EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
2895 Packet4ui res = vld1q_dup_u32(from);
2896 res = vld1q_lane_u32(from + 1 * stride, res, 1);
2897 res = vld1q_lane_u32(from + 2 * stride, res, 2);
2898 res = vld1q_lane_u32(from + 3 * stride, res, 3);
2899 return res;
2900}
2901template <>
2902EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
2903 Packet2l res = vld1q_dup_s64(from);
2904 res = vld1q_lane_s64(from + 1 * stride, res, 1);
2905 return res;
2906}
2907template <>
2908EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
2909 Packet2ul res = vld1q_dup_u64(from);
2910 res = vld1q_lane_u64(from + 1 * stride, res, 1);
2911 return res;
2912}
2913
2914template <>
2915EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride) {
2916 vst1_lane_f32(to + stride * 0, from, 0);
2917 vst1_lane_f32(to + stride * 1, from, 1);
2918}
2919template <>
2920EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
2921 vst1q_lane_f32(to + stride * 0, from, 0);
2922 vst1q_lane_f32(to + stride * 1, from, 1);
2923 vst1q_lane_f32(to + stride * 2, from, 2);
2924 vst1q_lane_f32(to + stride * 3, from, 3);
2925}
2926template <>
2927EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride) {
2928 for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
2929}
2930template <>
2931EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride) {
2932 vst1_lane_s8(to + stride * 0, from, 0);
2933 vst1_lane_s8(to + stride * 1, from, 1);
2934 vst1_lane_s8(to + stride * 2, from, 2);
2935 vst1_lane_s8(to + stride * 3, from, 3);
2936 vst1_lane_s8(to + stride * 4, from, 4);
2937 vst1_lane_s8(to + stride * 5, from, 5);
2938 vst1_lane_s8(to + stride * 6, from, 6);
2939 vst1_lane_s8(to + stride * 7, from, 7);
2940}
2941template <>
2942EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
2943 Index stride) {
2944 vst1q_lane_s8(to + stride * 0, from, 0);
2945 vst1q_lane_s8(to + stride * 1, from, 1);
2946 vst1q_lane_s8(to + stride * 2, from, 2);
2947 vst1q_lane_s8(to + stride * 3, from, 3);
2948 vst1q_lane_s8(to + stride * 4, from, 4);
2949 vst1q_lane_s8(to + stride * 5, from, 5);
2950 vst1q_lane_s8(to + stride * 6, from, 6);
2951 vst1q_lane_s8(to + stride * 7, from, 7);
2952 vst1q_lane_s8(to + stride * 8, from, 8);
2953 vst1q_lane_s8(to + stride * 9, from, 9);
2954 vst1q_lane_s8(to + stride * 10, from, 10);
2955 vst1q_lane_s8(to + stride * 11, from, 11);
2956 vst1q_lane_s8(to + stride * 12, from, 12);
2957 vst1q_lane_s8(to + stride * 13, from, 13);
2958 vst1q_lane_s8(to + stride * 14, from, 14);
2959 vst1q_lane_s8(to + stride * 15, from, 15);
2960}
2961template <>
2962EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from,
2963 Index stride) {
2964 for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
2965}
2966template <>
2967EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from,
2968 Index stride) {
2969 vst1_lane_u8(to + stride * 0, from, 0);
2970 vst1_lane_u8(to + stride * 1, from, 1);
2971 vst1_lane_u8(to + stride * 2, from, 2);
2972 vst1_lane_u8(to + stride * 3, from, 3);
2973 vst1_lane_u8(to + stride * 4, from, 4);
2974 vst1_lane_u8(to + stride * 5, from, 5);
2975 vst1_lane_u8(to + stride * 6, from, 6);
2976 vst1_lane_u8(to + stride * 7, from, 7);
2977}
2978template <>
2979EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
2980 Index stride) {
2981 vst1q_lane_u8(to + stride * 0, from, 0);
2982 vst1q_lane_u8(to + stride * 1, from, 1);
2983 vst1q_lane_u8(to + stride * 2, from, 2);
2984 vst1q_lane_u8(to + stride * 3, from, 3);
2985 vst1q_lane_u8(to + stride * 4, from, 4);
2986 vst1q_lane_u8(to + stride * 5, from, 5);
2987 vst1q_lane_u8(to + stride * 6, from, 6);
2988 vst1q_lane_u8(to + stride * 7, from, 7);
2989 vst1q_lane_u8(to + stride * 8, from, 8);
2990 vst1q_lane_u8(to + stride * 9, from, 9);
2991 vst1q_lane_u8(to + stride * 10, from, 10);
2992 vst1q_lane_u8(to + stride * 11, from, 11);
2993 vst1q_lane_u8(to + stride * 12, from, 12);
2994 vst1q_lane_u8(to + stride * 13, from, 13);
2995 vst1q_lane_u8(to + stride * 14, from, 14);
2996 vst1q_lane_u8(to + stride * 15, from, 15);
2997}
2998template <>
2999EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from,
3000 Index stride) {
3001 vst1_lane_s16(to + stride * 0, from, 0);
3002 vst1_lane_s16(to + stride * 1, from, 1);
3003 vst1_lane_s16(to + stride * 2, from, 2);
3004 vst1_lane_s16(to + stride * 3, from, 3);
3005}
3006template <>
3007EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
3008 Index stride) {
3009 vst1q_lane_s16(to + stride * 0, from, 0);
3010 vst1q_lane_s16(to + stride * 1, from, 1);
3011 vst1q_lane_s16(to + stride * 2, from, 2);
3012 vst1q_lane_s16(to + stride * 3, from, 3);
3013 vst1q_lane_s16(to + stride * 4, from, 4);
3014 vst1q_lane_s16(to + stride * 5, from, 5);
3015 vst1q_lane_s16(to + stride * 6, from, 6);
3016 vst1q_lane_s16(to + stride * 7, from, 7);
3017}
3018template <>
3019EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from,
3020 Index stride) {
3021 vst1_lane_u16(to + stride * 0, from, 0);
3022 vst1_lane_u16(to + stride * 1, from, 1);
3023 vst1_lane_u16(to + stride * 2, from, 2);
3024 vst1_lane_u16(to + stride * 3, from, 3);
3025}
3026template <>
3027EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
3028 Index stride) {
3029 vst1q_lane_u16(to + stride * 0, from, 0);
3030 vst1q_lane_u16(to + stride * 1, from, 1);
3031 vst1q_lane_u16(to + stride * 2, from, 2);
3032 vst1q_lane_u16(to + stride * 3, from, 3);
3033 vst1q_lane_u16(to + stride * 4, from, 4);
3034 vst1q_lane_u16(to + stride * 5, from, 5);
3035 vst1q_lane_u16(to + stride * 6, from, 6);
3036 vst1q_lane_u16(to + stride * 7, from, 7);
3037}
3038template <>
3039EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from,
3040 Index stride) {
3041 vst1_lane_s32(to + stride * 0, from, 0);
3042 vst1_lane_s32(to + stride * 1, from, 1);
3043}
3044template <>
3045EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
3046 Index stride) {
3047 vst1q_lane_s32(to + stride * 0, from, 0);
3048 vst1q_lane_s32(to + stride * 1, from, 1);
3049 vst1q_lane_s32(to + stride * 2, from, 2);
3050 vst1q_lane_s32(to + stride * 3, from, 3);
3051}
3052template <>
3053EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from,
3054 Index stride) {
3055 vst1_lane_u32(to + stride * 0, from, 0);
3056 vst1_lane_u32(to + stride * 1, from, 1);
3057}
3058template <>
3059EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
3060 Index stride) {
3061 vst1q_lane_u32(to + stride * 0, from, 0);
3062 vst1q_lane_u32(to + stride * 1, from, 1);
3063 vst1q_lane_u32(to + stride * 2, from, 2);
3064 vst1q_lane_u32(to + stride * 3, from, 3);
3065}
3066template <>
3067EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
3068 Index stride) {
3069 vst1q_lane_s64(to + stride * 0, from, 0);
3070 vst1q_lane_s64(to + stride * 1, from, 1);
3071}
3072template <>
3073EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
3074 Index stride) {
3075 vst1q_lane_u64(to + stride * 0, from, 0);
3076 vst1q_lane_u64(to + stride * 1, from, 1);
3077}
3078
3079template <>
3080EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
3081 EIGEN_ARM_PREFETCH(addr);
3082}
3083template <>
3084EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
3085 EIGEN_ARM_PREFETCH(addr);
3086}
3087template <>
3088EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
3089 EIGEN_ARM_PREFETCH(addr);
3090}
3091template <>
3092EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
3093 EIGEN_ARM_PREFETCH(addr);
3094}
3095template <>
3096EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
3097 EIGEN_ARM_PREFETCH(addr);
3098}
3099template <>
3100EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
3101 EIGEN_ARM_PREFETCH(addr);
3102}
3103template <>
3104EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
3105 EIGEN_ARM_PREFETCH(addr);
3106}
3107template <>
3108EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
3109 EIGEN_ARM_PREFETCH(addr);
3110}
3111template <>
3112EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
3113 EIGEN_ARM_PREFETCH(addr);
3114}
3115
3116template <>
3117EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) {
3118 return vget_lane_f32(a, 0);
3119}
3120template <>
3121EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
3122 return vgetq_lane_f32(a, 0);
3123}
3124template <>
3125EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) {
3126 return static_cast<int8_t>(a & 0xff);
3127}
3128template <>
3129EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) {
3130 return vget_lane_s8(a, 0);
3131}
3132template <>
3133EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
3134 return vgetq_lane_s8(a, 0);
3135}
3136template <>
3137EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) {
3138 return static_cast<uint8_t>(a & 0xff);
3139}
3140template <>
3141EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) {
3142 return vget_lane_u8(a, 0);
3143}
3144template <>
3145EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
3146 return vgetq_lane_u8(a, 0);
3147}
3148template <>
3149EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) {
3150 return vget_lane_s16(a, 0);
3151}
3152template <>
3153EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
3154 return vgetq_lane_s16(a, 0);
3155}
3156template <>
3157EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) {
3158 return vget_lane_u16(a, 0);
3159}
3160template <>
3161EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
3162 return vgetq_lane_u16(a, 0);
3163}
3164template <>
3165EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) {
3166 return vget_lane_s32(a, 0);
3167}
3168template <>
3169EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
3170 return vgetq_lane_s32(a, 0);
3171}
3172template <>
3173EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) {
3174 return vget_lane_u32(a, 0);
3175}
3176template <>
3177EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
3178 return vgetq_lane_u32(a, 0);
3179}
3180template <>
3181EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
3182 return vgetq_lane_s64(a, 0);
3183}
3184template <>
3185EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
3186 return vgetq_lane_u64(a, 0);
3187}
3188
3189template <>
3190EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) {
3191 return vrev64_f32(a);
3192}
3193template <>
3194EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
3195 const float32x4_t a_r64 = vrev64q_f32(a);
3196 return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
3197}
3198template <>
3199EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) {
3200 return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3201}
3202template <>
3203EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) {
3204 return vrev64_s8(a);
3205}
3206template <>
3207EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
3208 const int8x16_t a_r64 = vrev64q_s8(a);
3209 return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
3210}
3211template <>
3212EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) {
3213 return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
3214}
3215template <>
3216EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) {
3217 return vrev64_u8(a);
3218}
3219template <>
3220EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
3221 const uint8x16_t a_r64 = vrev64q_u8(a);
3222 return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
3223}
3224template <>
3225EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) {
3226 return vrev64_s16(a);
3227}
3228template <>
3229EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
3230 const int16x8_t a_r64 = vrev64q_s16(a);
3231 return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
3232}
3233template <>
3234EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) {
3235 return vrev64_u16(a);
3236}
3237template <>
3238EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
3239 const uint16x8_t a_r64 = vrev64q_u16(a);
3240 return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
3241}
3242template <>
3243EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) {
3244 return vrev64_s32(a);
3245}
3246template <>
3247EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
3248 const int32x4_t a_r64 = vrev64q_s32(a);
3249 return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
3250}
3251template <>
3252EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) {
3253 return vrev64_u32(a);
3254}
3255template <>
3256EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
3257 const uint32x4_t a_r64 = vrev64q_u32(a);
3258 return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
3259}
3260template <>
3261EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
3262 return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
3263}
3264template <>
3265EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
3266 return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
3267}
3268
3269template <>
3270EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) {
3271 return vabs_f32(a);
3272}
3273template <>
3274EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
3275 return vabsq_f32(a);
3276}
3277template <>
3278EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a) {
3279 return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3280}
3281template <>
3282EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) {
3283 return vabs_s8(a);
3284}
3285template <>
3286EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
3287 return vabsq_s8(a);
3288}
3289template <>
3290EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) {
3291 return a;
3292}
3293template <>
3294EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) {
3295 return a;
3296}
3297template <>
3298EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
3299 return a;
3300}
3301template <>
3302EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) {
3303 return vabs_s16(a);
3304}
3305template <>
3306EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
3307 return vabsq_s16(a);
3308}
3309template <>
3310EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) {
3311 return a;
3312}
3313template <>
3314EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
3315 return a;
3316}
3317template <>
3318EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) {
3319 return vabs_s32(a);
3320}
3321template <>
3322EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
3323 return vabsq_s32(a);
3324}
3325template <>
3326EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) {
3327 return a;
3328}
3329template <>
3330EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
3331 return a;
3332}
3333template <>
3334EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
3335#if EIGEN_ARCH_ARM64
3336 return vabsq_s64(a);
3337#else
3338 return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
3339#endif
3340}
3341template <>
3342EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
3343 return a;
3344}
3345
3346template <>
3347EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
3348 return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
3349}
3350template <>
3351EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
3352 return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
3353}
3354
3355template <>
3356EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent) {
3357 return pfrexp_generic(a, exponent);
3358}
3359template <>
3360EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
3361 return pfrexp_generic(a, exponent);
3362}
3363
3364template <>
3365EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent) {
3366 return pldexp_generic(a, exponent);
3367}
3368template <>
3369EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
3370 return pldexp_generic(a, exponent);
3371}
3372
3373#if EIGEN_ARCH_ARM64
3374template <>
3375EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
3376 return vaddv_f32(a);
3377}
3378template <>
3379EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
3380 return vaddvq_f32(a);
3381}
3382#else
3383template <>
3384EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
3385 return vget_lane_f32(vpadd_f32(a, a), 0);
3386}
3387template <>
3388EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
3389 const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
3390 return vget_lane_f32(vpadd_f32(sum, sum), 0);
3391}
3392#endif
3393template <>
3394EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a) {
3395 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3396 int8x8_t sum = vpadd_s8(a_dup, a_dup);
3397 sum = vpadd_s8(sum, sum);
3398 return vget_lane_s8(sum, 0);
3399}
3400#if EIGEN_ARCH_ARM64
3401template <>
3402EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
3403 return vaddv_s8(a);
3404}
3405template <>
3406EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
3407 return vaddvq_s8(a);
3408}
3409#else
3410template <>
3411EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
3412 int8x8_t sum = vpadd_s8(a, a);
3413 sum = vpadd_s8(sum, sum);
3414 sum = vpadd_s8(sum, sum);
3415 return vget_lane_s8(sum, 0);
3416}
3417template <>
3418EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
3419 int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
3420 sum = vpadd_s8(sum, sum);
3421 sum = vpadd_s8(sum, sum);
3422 sum = vpadd_s8(sum, sum);
3423 return vget_lane_s8(sum, 0);
3424}
3425#endif
3426template <>
3427EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a) {
3428 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3429 uint8x8_t sum = vpadd_u8(a_dup, a_dup);
3430 sum = vpadd_u8(sum, sum);
3431 return vget_lane_u8(sum, 0);
3432}
3433#if EIGEN_ARCH_ARM64
3434template <>
3435EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
3436 return vaddv_u8(a);
3437}
3438template <>
3439EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
3440 return vaddvq_u8(a);
3441}
3442template <>
3443EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
3444 return vaddv_s16(a);
3445}
3446template <>
3447EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
3448 return vaddvq_s16(a);
3449}
3450template <>
3451EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
3452 return vaddv_u16(a);
3453}
3454template <>
3455EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
3456 return vaddvq_u16(a);
3457}
3458template <>
3459EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
3460 return vaddv_s32(a);
3461}
3462template <>
3463EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
3464 return vaddvq_s32(a);
3465}
3466template <>
3467EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
3468 return vaddv_u32(a);
3469}
3470template <>
3471EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
3472 return vaddvq_u32(a);
3473}
3474template <>
3475EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
3476 return vaddvq_s64(a);
3477}
3478template <>
3479EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
3480 return vaddvq_u64(a);
3481}
3482#else
3483template <>
3484EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
3485 uint8x8_t sum = vpadd_u8(a, a);
3486 sum = vpadd_u8(sum, sum);
3487 sum = vpadd_u8(sum, sum);
3488 return vget_lane_u8(sum, 0);
3489}
3490template <>
3491EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
3492 uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
3493 sum = vpadd_u8(sum, sum);
3494 sum = vpadd_u8(sum, sum);
3495 sum = vpadd_u8(sum, sum);
3496 return vget_lane_u8(sum, 0);
3497}
3498template <>
3499EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
3500 const int16x4_t sum = vpadd_s16(a, a);
3501 return vget_lane_s16(vpadd_s16(sum, sum), 0);
3502}
3503template <>
3504EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
3505 int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
3506 sum = vpadd_s16(sum, sum);
3507 sum = vpadd_s16(sum, sum);
3508 return vget_lane_s16(sum, 0);
3509}
3510template <>
3511EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
3512 const uint16x4_t sum = vpadd_u16(a, a);
3513 return vget_lane_u16(vpadd_u16(sum, sum), 0);
3514}
3515template <>
3516EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
3517 uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
3518 sum = vpadd_u16(sum, sum);
3519 sum = vpadd_u16(sum, sum);
3520 return vget_lane_u16(sum, 0);
3521}
3522template <>
3523EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
3524 return vget_lane_s32(vpadd_s32(a, a), 0);
3525}
3526template <>
3527EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
3528 const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
3529 return vget_lane_s32(vpadd_s32(sum, sum), 0);
3530}
3531template <>
3532EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
3533 return vget_lane_u32(vpadd_u32(a, a), 0);
3534}
3535template <>
3536EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
3537 const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
3538 return vget_lane_u32(vpadd_u32(sum, sum), 0);
3539}
3540template <>
3541EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
3542 return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
3543}
3544template <>
3545EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
3546 return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
3547}
3548#endif
3549
3550template <>
3551EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
3552 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
3553}
3554template <>
3555EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
3556 return vadd_s8(vget_high_s8(a), vget_low_s8(a));
3557}
3558template <>
3559EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
3560 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
3561}
3562template <>
3563EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
3564 return vadd_u8(vget_high_u8(a), vget_low_u8(a));
3565}
3566template <>
3567EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
3568 return vadd_s16(vget_high_s16(a), vget_low_s16(a));
3569}
3570template <>
3571EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
3572 return vadd_u16(vget_high_u16(a), vget_low_u16(a));
3573}
3574
3575// Other reduction functions:
3576// mul
3577template <>
3578EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a) {
3579 return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
3580}
3581template <>
3582EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
3583 return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
3584}
3585template <>
3586EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a) {
3587 int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
3588 prod = vmul_s8(prod, vrev16_s8(prod));
3589 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
3590}
3591template <>
3592EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a) {
3593 int8x8_t prod = vmul_s8(a, vrev16_s8(a));
3594 prod = vmul_s8(prod, vrev32_s8(prod));
3595 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
3596}
3597template <>
3598EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
3599 return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
3600}
3601template <>
3602EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a) {
3603 uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
3604 prod = vmul_u8(prod, vrev16_u8(prod));
3605 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
3606}
3607template <>
3608EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a) {
3609 uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
3610 prod = vmul_u8(prod, vrev32_u8(prod));
3611 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
3612}
3613template <>
3614EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
3615 return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
3616}
3617template <>
3618EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a) {
3619 const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
3620 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3621}
3622template <>
3623EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
3624 int16x4_t prod;
3625
3626 // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3627 prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
3628 // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3629 prod = vmul_s16(prod, vrev32_s16(prod));
3630 // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3631 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3632}
3633template <>
3634EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a) {
3635 const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
3636 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3637}
3638template <>
3639EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
3640 uint16x4_t prod;
3641
3642 // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3643 prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
3644 // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3645 prod = vmul_u16(prod, vrev32_u16(prod));
3646 // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3647 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3648}
3649template <>
3650EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a) {
3651 return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
3652}
3653template <>
3654EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
3655 return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
3656}
3657template <>
3658EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a) {
3659 return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
3660}
3661template <>
3662EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
3663 return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
3664}
3665template <>
3666EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
3667 return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
3668}
3669template <>
3670EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
3671 return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
3672}
3673
3674// min
3675#if EIGEN_ARCH_ARM64
3676template <>
3677EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
3678 return vminv_f32(a);
3679}
3680template <>
3681EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
3682 return vminvq_f32(a);
3683}
3684#else
3685template <>
3686EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
3687 return vget_lane_f32(vpmin_f32(a, a), 0);
3688}
3689template <>
3690EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
3691 const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
3692 return vget_lane_f32(vpmin_f32(min, min), 0);
3693}
3694#endif
3695template <>
3696EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a) {
3697 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3698 int8x8_t min = vpmin_s8(a_dup, a_dup);
3699 min = vpmin_s8(min, min);
3700 return vget_lane_s8(min, 0);
3701}
3702#if EIGEN_ARCH_ARM64
3703template <>
3704EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
3705 return vminv_s8(a);
3706}
3707template <>
3708EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
3709 return vminvq_s8(a);
3710}
3711#else
3712template <>
3713EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
3714 int8x8_t min = vpmin_s8(a, a);
3715 min = vpmin_s8(min, min);
3716 min = vpmin_s8(min, min);
3717 return vget_lane_s8(min, 0);
3718}
3719template <>
3720EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
3721 int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
3722 min = vpmin_s8(min, min);
3723 min = vpmin_s8(min, min);
3724 min = vpmin_s8(min, min);
3725 return vget_lane_s8(min, 0);
3726}
3727#endif
3728template <>
3729EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a) {
3730 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3731 uint8x8_t min = vpmin_u8(a_dup, a_dup);
3732 min = vpmin_u8(min, min);
3733 return vget_lane_u8(min, 0);
3734}
3735#if EIGEN_ARCH_ARM64
3736template <>
3737EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
3738 return vminv_u8(a);
3739}
3740template <>
3741EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
3742 return vminvq_u8(a);
3743}
3744template <>
3745EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
3746 return vminv_s16(a);
3747}
3748template <>
3749EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
3750 return vminvq_s16(a);
3751}
3752template <>
3753EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
3754 return vminv_u16(a);
3755}
3756template <>
3757EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
3758 return vminvq_u16(a);
3759}
3760template <>
3761EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
3762 return vminv_s32(a);
3763}
3764template <>
3765EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
3766 return vminvq_s32(a);
3767}
3768template <>
3769EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
3770 return vminv_u32(a);
3771}
3772template <>
3773EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
3774 return vminvq_u32(a);
3775}
3776#else
3777template <>
3778EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
3779 uint8x8_t min = vpmin_u8(a, a);
3780 min = vpmin_u8(min, min);
3781 min = vpmin_u8(min, min);
3782 return vget_lane_u8(min, 0);
3783}
3784template <>
3785EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
3786 uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
3787 min = vpmin_u8(min, min);
3788 min = vpmin_u8(min, min);
3789 min = vpmin_u8(min, min);
3790 return vget_lane_u8(min, 0);
3791}
3792template <>
3793EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
3794 const int16x4_t min = vpmin_s16(a, a);
3795 return vget_lane_s16(vpmin_s16(min, min), 0);
3796}
3797template <>
3798EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
3799 int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
3800 min = vpmin_s16(min, min);
3801 min = vpmin_s16(min, min);
3802 return vget_lane_s16(min, 0);
3803}
3804template <>
3805EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
3806 const uint16x4_t min = vpmin_u16(a, a);
3807 return vget_lane_u16(vpmin_u16(min, min), 0);
3808}
3809template <>
3810EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
3811 uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
3812 min = vpmin_u16(min, min);
3813 min = vpmin_u16(min, min);
3814 return vget_lane_u16(min, 0);
3815}
3816template <>
3817EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
3818 return vget_lane_s32(vpmin_s32(a, a), 0);
3819}
3820template <>
3821EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
3822 const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
3823 return vget_lane_s32(vpmin_s32(min, min), 0);
3824}
3825template <>
3826EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
3827 return vget_lane_u32(vpmin_u32(a, a), 0);
3828}
3829template <>
3830EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
3831 const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
3832 return vget_lane_u32(vpmin_u32(min, min), 0);
3833}
3834#endif
3835template <>
3836EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
3837 return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3838}
3839template <>
3840EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
3841 return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
3842}
3843
3844// max
3845#if EIGEN_ARCH_ARM64
3846template <>
3847EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
3848 return vmaxv_f32(a);
3849}
3850template <>
3851EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
3852 return vmaxvq_f32(a);
3853}
3854#else
3855template <>
3856EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
3857 return vget_lane_f32(vpmax_f32(a, a), 0);
3858}
3859template <>
3860EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
3861 const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
3862 return vget_lane_f32(vpmax_f32(max, max), 0);
3863}
3864#endif
3865template <>
3866EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a) {
3867 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3868 int8x8_t max = vpmax_s8(a_dup, a_dup);
3869 max = vpmax_s8(max, max);
3870 return vget_lane_s8(max, 0);
3871}
3872#if EIGEN_ARCH_ARM64
3873template <>
3874EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
3875 return vmaxv_s8(a);
3876}
3877template <>
3878EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
3879 return vmaxvq_s8(a);
3880}
3881#else
3882template <>
3883EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
3884 int8x8_t max = vpmax_s8(a, a);
3885 max = vpmax_s8(max, max);
3886 max = vpmax_s8(max, max);
3887 return vget_lane_s8(max, 0);
3888}
3889template <>
3890EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
3891 int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
3892 max = vpmax_s8(max, max);
3893 max = vpmax_s8(max, max);
3894 max = vpmax_s8(max, max);
3895 return vget_lane_s8(max, 0);
3896}
3897#endif
3898template <>
3899EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a) {
3900 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3901 uint8x8_t max = vpmax_u8(a_dup, a_dup);
3902 max = vpmax_u8(max, max);
3903 return vget_lane_u8(max, 0);
3904}
3905#if EIGEN_ARCH_ARM64
3906template <>
3907EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
3908 return vmaxv_u8(a);
3909}
3910template <>
3911EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
3912 return vmaxvq_u8(a);
3913}
3914template <>
3915EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
3916 return vmaxv_s16(a);
3917}
3918template <>
3919EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
3920 return vmaxvq_s16(a);
3921}
3922template <>
3923EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
3924 return vmaxv_u16(a);
3925}
3926template <>
3927EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
3928 return vmaxvq_u16(a);
3929}
3930template <>
3931EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
3932 return vmaxv_s32(a);
3933}
3934template <>
3935EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
3936 return vmaxvq_s32(a);
3937}
3938template <>
3939EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
3940 return vmaxv_u32(a);
3941}
3942template <>
3943EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
3944 return vmaxvq_u32(a);
3945}
3946#else
3947template <>
3948EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
3949 uint8x8_t max = vpmax_u8(a, a);
3950 max = vpmax_u8(max, max);
3951 max = vpmax_u8(max, max);
3952 return vget_lane_u8(max, 0);
3953}
3954template <>
3955EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
3956 uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
3957 max = vpmax_u8(max, max);
3958 max = vpmax_u8(max, max);
3959 max = vpmax_u8(max, max);
3960 return vget_lane_u8(max, 0);
3961}
3962template <>
3963EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
3964 const int16x4_t max = vpmax_s16(a, a);
3965 return vget_lane_s16(vpmax_s16(max, max), 0);
3966}
3967template <>
3968EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
3969 int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
3970 max = vpmax_s16(max, max);
3971 max = vpmax_s16(max, max);
3972 return vget_lane_s16(max, 0);
3973}
3974template <>
3975EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
3976 const uint16x4_t max = vpmax_u16(a, a);
3977 return vget_lane_u16(vpmax_u16(max, max), 0);
3978}
3979template <>
3980EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
3981 uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
3982 max = vpmax_u16(max, max);
3983 max = vpmax_u16(max, max);
3984 return vget_lane_u16(max, 0);
3985}
3986template <>
3987EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
3988 return vget_lane_s32(vpmax_s32(a, a), 0);
3989}
3990template <>
3991EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
3992 const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
3993 return vget_lane_s32(vpmax_s32(max, max), 0);
3994}
3995template <>
3996EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
3997 return vget_lane_u32(vpmax_u32(a, a), 0);
3998}
3999template <>
4000EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
4001 const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
4002 return vget_lane_u32(vpmax_u32(max, max), 0);
4003}
4004#endif
4005template <>
4006EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
4007 return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
4008}
4009template <>
4010EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
4011 return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
4012}
4013
4014template <>
4015EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
4016 uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
4017 return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
4018}
4019
4020// Helpers for ptranspose.
4021namespace detail {
4022
4023template <typename Packet>
4024void zip_in_place(Packet& p1, Packet& p2);
4025
4026template <>
4027EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
4028 const float32x2x2_t tmp = vzip_f32(p1, p2);
4029 p1 = tmp.val[0];
4030 p2 = tmp.val[1];
4031}
4032
4033template <>
4034EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
4035 const float32x4x2_t tmp = vzipq_f32(p1, p2);
4036 p1 = tmp.val[0];
4037 p2 = tmp.val[1];
4038}
4039
4040template <>
4041EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
4042 const int8x8x2_t tmp = vzip_s8(p1, p2);
4043 p1 = tmp.val[0];
4044 p2 = tmp.val[1];
4045}
4046
4047template <>
4048EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
4049 const int8x16x2_t tmp = vzipq_s8(p1, p2);
4050 p1 = tmp.val[0];
4051 p2 = tmp.val[1];
4052}
4053
4054template <>
4055EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
4056 const uint8x8x2_t tmp = vzip_u8(p1, p2);
4057 p1 = tmp.val[0];
4058 p2 = tmp.val[1];
4059}
4060
4061template <>
4062EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
4063 const uint8x16x2_t tmp = vzipq_u8(p1, p2);
4064 p1 = tmp.val[0];
4065 p2 = tmp.val[1];
4066}
4067
4068template <>
4069EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
4070 const int32x2x2_t tmp = vzip_s32(p1, p2);
4071 p1 = tmp.val[0];
4072 p2 = tmp.val[1];
4073}
4074
4075template <>
4076EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
4077 const int32x4x2_t tmp = vzipq_s32(p1, p2);
4078 p1 = tmp.val[0];
4079 p2 = tmp.val[1];
4080}
4081
4082template <>
4083EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
4084 const uint32x2x2_t tmp = vzip_u32(p1, p2);
4085 p1 = tmp.val[0];
4086 p2 = tmp.val[1];
4087}
4088
4089template <>
4090EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
4091 const uint32x4x2_t tmp = vzipq_u32(p1, p2);
4092 p1 = tmp.val[0];
4093 p2 = tmp.val[1];
4094}
4095
4096template <>
4097EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
4098 const int16x4x2_t tmp = vzip_s16(p1, p2);
4099 p1 = tmp.val[0];
4100 p2 = tmp.val[1];
4101}
4102
4103template <>
4104EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
4105 const int16x8x2_t tmp = vzipq_s16(p1, p2);
4106 p1 = tmp.val[0];
4107 p2 = tmp.val[1];
4108}
4109
4110template <>
4111EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
4112 const uint16x4x2_t tmp = vzip_u16(p1, p2);
4113 p1 = tmp.val[0];
4114 p2 = tmp.val[1];
4115}
4116
4117template <>
4118EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
4119 const uint16x8x2_t tmp = vzipq_u16(p1, p2);
4120 p1 = tmp.val[0];
4121 p2 = tmp.val[1];
4122}
4123
4124template <typename Packet>
4125EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
4126 zip_in_place(kernel.packet[0], kernel.packet[1]);
4127}
4128
4129template <typename Packet>
4130EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
4131 zip_in_place(kernel.packet[0], kernel.packet[2]);
4132 zip_in_place(kernel.packet[1], kernel.packet[3]);
4133 zip_in_place(kernel.packet[0], kernel.packet[1]);
4134 zip_in_place(kernel.packet[2], kernel.packet[3]);
4135}
4136
4137template <typename Packet>
4138EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
4139 zip_in_place(kernel.packet[0], kernel.packet[4]);
4140 zip_in_place(kernel.packet[1], kernel.packet[5]);
4141 zip_in_place(kernel.packet[2], kernel.packet[6]);
4142 zip_in_place(kernel.packet[3], kernel.packet[7]);
4143
4144 zip_in_place(kernel.packet[0], kernel.packet[2]);
4145 zip_in_place(kernel.packet[1], kernel.packet[3]);
4146 zip_in_place(kernel.packet[4], kernel.packet[6]);
4147 zip_in_place(kernel.packet[5], kernel.packet[7]);
4148
4149 zip_in_place(kernel.packet[0], kernel.packet[1]);
4150 zip_in_place(kernel.packet[2], kernel.packet[3]);
4151 zip_in_place(kernel.packet[4], kernel.packet[5]);
4152 zip_in_place(kernel.packet[6], kernel.packet[7]);
4153}
4154
4155template <typename Packet>
4156EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
4157 EIGEN_UNROLL_LOOP
4158 for (int i = 0; i < 4; ++i) {
4159 const int m = (1 << i);
4160 EIGEN_UNROLL_LOOP
4161 for (int j = 0; j < m; ++j) {
4162 const int n = (1 << (3 - i));
4163 EIGEN_UNROLL_LOOP
4164 for (int k = 0; k < n; ++k) {
4165 const int idx = 2 * j * n + k;
4166 zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
4167 }
4168 }
4169 }
4170}
4171
4172} // namespace detail
4173
4174EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
4175 detail::ptranspose_impl(kernel);
4176}
4177EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
4178 detail::ptranspose_impl(kernel);
4179}
4180
4181EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
4182 const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
4183 const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
4184
4185 const int8x8x2_t zip8 = vzip_s8(a, b);
4186 const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
4187
4188 kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
4189 kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
4190 kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
4191 kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
4192}
4193EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
4194 detail::ptranspose_impl(kernel);
4195}
4196EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
4197 detail::ptranspose_impl(kernel);
4198}
4199EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
4200 detail::ptranspose_impl(kernel);
4201}
4202EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
4203 detail::ptranspose_impl(kernel);
4204}
4205EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
4206 detail::ptranspose_impl(kernel);
4207}
4208
4209EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
4210 const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
4211 const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
4212
4213 const uint8x8x2_t zip8 = vzip_u8(a, b);
4214 const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
4215
4216 kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
4217 kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
4218 kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
4219 kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
4220}
4221EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
4222 detail::ptranspose_impl(kernel);
4223}
4224EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
4225 detail::ptranspose_impl(kernel);
4226}
4227EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
4228 detail::ptranspose_impl(kernel);
4229}
4230EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
4231 detail::ptranspose_impl(kernel);
4232}
4233EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
4234 detail::ptranspose_impl(kernel);
4235}
4236
4237EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
4238 detail::ptranspose_impl(kernel);
4239}
4240EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
4241 detail::ptranspose_impl(kernel);
4242}
4243EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
4244 detail::ptranspose_impl(kernel);
4245}
4246
4247EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
4248 detail::ptranspose_impl(kernel);
4249}
4250EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
4251 detail::ptranspose_impl(kernel);
4252}
4253EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
4254 detail::ptranspose_impl(kernel);
4255}
4256
4257EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
4258 detail::ptranspose_impl(kernel);
4259}
4260EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
4261 detail::ptranspose_impl(kernel);
4262}
4263EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
4264 detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
4265}
4266EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
4267 detail::ptranspose_impl(kernel);
4268}
4269
4270EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
4271#if EIGEN_ARCH_ARM64
4272 const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
4273 kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
4274 kernel.packet[0] = tmp1;
4275#else
4276 const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
4277 {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
4278
4279 kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
4280 kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
4281#endif
4282}
4283EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
4284#if EIGEN_ARCH_ARM64
4285 const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
4286 kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
4287 kernel.packet[0] = tmp1;
4288#else
4289 const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
4290 {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
4291
4292 kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
4293 kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
4294#endif
4295}
4296
4297template <>
4298EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b) {
4299 return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
4300}
4301template <>
4302EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
4303 return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
4304}
4305template <>
4306EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) {
4307 return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
4308}
4309template <>
4310EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
4311 return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
4312}
4313template <>
4314EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) {
4315 return vbsl_u8(mask, a, b);
4316}
4317template <>
4318EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
4319 const Packet16uc& b) {
4320 return vbslq_u8(mask, a, b);
4321}
4322template <>
4323EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) {
4324 return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
4325}
4326template <>
4327EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
4328 return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
4329}
4330template <>
4331EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) {
4332 return vbsl_u16(mask, a, b);
4333}
4334template <>
4335EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
4336 return vbslq_u16(mask, a, b);
4337}
4338template <>
4339EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) {
4340 return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
4341}
4342template <>
4343EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
4344 return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
4345}
4346template <>
4347EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) {
4348 return vbsl_u32(mask, a, b);
4349}
4350template <>
4351EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
4352 return vbslq_u32(mask, a, b);
4353}
4354template <>
4355EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
4356 return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
4357}
4358template <>
4359EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
4360 return vbslq_u64(mask, a, b);
4361}
4362
// Use armv8 rounding intrinsics if available.
4364#if EIGEN_ARCH_ARMV8
4365template <>
4366EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
4367 return vrndn_f32(a);
4368}
4369
4370template <>
4371EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
4372 return vrndnq_f32(a);
4373}
4374
4375template <>
4376EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
4377 return vrndm_f32(a);
4378}
4379
4380template <>
4381EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
4382 return vrndmq_f32(a);
4383}
4384
4385template <>
4386EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
4387 return vrndp_f32(a);
4388}
4389
4390template <>
4391EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
4392 return vrndpq_f32(a);
4393}
4394
4395template <>
4396EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(const Packet2f& a) {
4397 return vrnda_f32(a);
4398}
4399
4400template <>
4401EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
4402 return vrndaq_f32(a);
4403}
4404
4405template <>
4406EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(const Packet2f& a) {
4407 return vrnd_f32(a);
4408}
4409
4410template <>
4411EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
4412 return vrndq_f32(a);
4413}
4414#endif
4415
4422template <>
4423EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
4424 uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
4425 uint8x8_t res = vdup_n_u8(0);
4426 uint8x8_t add = vdup_n_u8(0x8);
4427 for (int i = 0; i < 4; i++) {
4428 const uint8x8_t temp = vorr_u8(res, add);
4429 res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
4430 add = vshr_n_u8(add, 1);
4431 }
4432 return vget_lane_u32(vreinterpret_u32_u8(res), 0);
4433}
4435template <>
4436EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
4437 uint8x8_t res = vdup_n_u8(0);
4438 uint8x8_t add = vdup_n_u8(0x8);
4439 for (int i = 0; i < 4; i++) {
4440 const uint8x8_t temp = vorr_u8(res, add);
4441 res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
4442 add = vshr_n_u8(add, 1);
4443 }
4444 return res;
4445}
4447template <>
4448EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
4449 uint8x16_t res = vdupq_n_u8(0);
4450 uint8x16_t add = vdupq_n_u8(0x8);
4451 for (int i = 0; i < 4; i++) {
4452 const uint8x16_t temp = vorrq_u8(res, add);
4453 res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
4454 add = vshrq_n_u8(add, 1);
4455 }
4456 return res;
4457}
4459template <>
4460EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
4461 uint16x4_t res = vdup_n_u16(0);
4462 uint16x4_t add = vdup_n_u16(0x80);
4463 for (int i = 0; i < 8; i++) {
4464 const uint16x4_t temp = vorr_u16(res, add);
4465 res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
4466 add = vshr_n_u16(add, 1);
4467 }
4468 return res;
4469}
4471template <>
4472EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
4473 uint16x8_t res = vdupq_n_u16(0);
4474 uint16x8_t add = vdupq_n_u16(0x80);
4475 for (int i = 0; i < 8; i++) {
4476 const uint16x8_t temp = vorrq_u16(res, add);
4477 res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
4478 add = vshrq_n_u16(add, 1);
4479 }
4480 return res;
4481}
4483template <>
4484EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
4485 uint32x2_t res = vdup_n_u32(0);
4486 uint32x2_t add = vdup_n_u32(0x8000);
4487 for (int i = 0; i < 16; i++) {
4488 const uint32x2_t temp = vorr_u32(res, add);
4489 res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
4490 add = vshr_n_u32(add, 1);
4491 }
4492 return res;
4493}
4495template <>
4496EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
4497 uint32x4_t res = vdupq_n_u32(0);
4498 uint32x4_t add = vdupq_n_u32(0x8000);
4499 for (int i = 0; i < 16; i++) {
4500 const uint32x4_t temp = vorrq_u32(res, add);
4501 res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
4502 add = vshrq_n_u32(add, 1);
4503 }
4504 return res;
4505}
4506
4507EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
4508 // Compute approximate reciprocal sqrt.
4509 // Does not correctly handle +/- 0 or +inf
4510 float32x4_t result = vrsqrteq_f32(a);
4511 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4512 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4513 return result;
4514}
4515
4516EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
4517 // Compute approximate reciprocal sqrt.
4518 // Does not correctly handle +/- 0 or +inf
4519 float32x2_t result = vrsqrte_f32(a);
4520 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4521 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4522 return result;
4523}
4524
4525template <typename Packet>
4526Packet prsqrt_float_common(const Packet& a) {
4527 const Packet cst_zero = pzero(a);
4528 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4529 Packet return_zero = pcmp_eq(a, cst_inf);
4530 Packet return_inf = pcmp_eq(a, cst_zero);
4531 Packet result = prsqrt_float_unsafe(a);
4532 result = pselect(return_inf, por(cst_inf, a), result);
4533 result = pandnot(result, return_zero);
4534 return result;
4535}
4536
4537template <>
4538EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
4539 return prsqrt_float_common(a);
4540}
4541
4542template <>
4543EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
4544 return prsqrt_float_common(a);
4545}
4546
4547template <>
4548EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
4549 // Compute approximate reciprocal.
4550 float32x4_t result = vrecpeq_f32(a);
4551 result = vmulq_f32(vrecpsq_f32(a, result), result);
4552 result = vmulq_f32(vrecpsq_f32(a, result), result);
4553 return result;
4554}
4555
4556template <>
4557EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a) {
4558 // Compute approximate reciprocal.
4559 float32x2_t result = vrecpe_f32(a);
4560 result = vmul_f32(vrecps_f32(a, result), result);
4561 result = vmul_f32(vrecps_f32(a, result), result);
4562 return result;
4563}
4564
4565// Unfortunately vsqrt_f32 is only available for A64.
4566#if EIGEN_ARCH_ARM64
4567template <>
4568EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
4569 return vsqrtq_f32(a);
4570}
4571
4572template <>
4573EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
4574 return vsqrt_f32(a);
4575}
4576
4577template <>
4578EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) {
4579 return vdivq_f32(a, b);
4580}
4581
4582template <>
4583EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) {
4584 return vdiv_f32(a, b);
4585}
4586#else
4587template <typename Packet>
4588EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
4589 const Packet cst_zero = pzero(a);
4590 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4591
4592 Packet result = pmul(a, prsqrt_float_unsafe(a));
4593 Packet a_is_zero = pcmp_eq(a, cst_zero);
4594 Packet a_is_inf = pcmp_eq(a, cst_inf);
4595 Packet return_a = por(a_is_zero, a_is_inf);
4596
4597 result = pselect(return_a, a, result);
4598 return result;
4599}
4600
4601template <>
4602EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
4603 return psqrt_float_common(a);
4604}
4605
4606template <>
4607EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
4608 return psqrt_float_common(a);
4609}
4610
4611template <typename Packet>
4612EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
4613 // if b is large, NEON intrinsics will flush preciprocal(b) to zero
4614 // avoid underflow with the following manipulation:
4615 // a / b = f * (a * reciprocal(f * b))
4616
4617 const Packet cst_one = pset1<Packet>(1.0f);
4618 const Packet cst_quarter = pset1<Packet>(0.25f);
4619 const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
4620
4621 Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
4622 Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
4623 Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
4624 return result;
4625}
4626
4627template <>
4628EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
4629 return pdiv_float_common(a, b);
4630}
4631
4632template <>
4633EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
4634 return pdiv_float_common(a, b);
4635}
4636#endif
4637
4638//---------- bfloat16 ----------
4639// TODO: Add support for native armv8.6-a bfloat16_t
4640
4641// TODO: Guard if we have native bfloat16 support
4642typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
4643
4644template <>
4645struct is_arithmetic<Packet4bf> {
4646 enum { value = true };
4647};
4648
4649template <>
4650struct packet_traits<bfloat16> : default_packet_traits {
4651 typedef Packet4bf type;
4652 typedef Packet4bf half;
4653 enum {
4654 Vectorizable = 1,
4655 AlignedOnScalar = 1,
4656 size = 4,
4657
4658 HasCmp = 1,
4659 HasAdd = 1,
4660 HasSub = 1,
4661 HasShift = 1,
4662 HasMul = 1,
4663 HasNegate = 1,
4664 HasAbs = 1,
4665 HasArg = 0,
4666 HasAbs2 = 1,
4667 HasAbsDiff = 1,
4668 HasMin = 1,
4669 HasMax = 1,
4670 HasConj = 1,
4671 HasSetLinear = 1,
4672 HasBlend = 0,
4673 HasDiv = 1,
4674 HasSin = EIGEN_FAST_MATH,
4675 HasCos = EIGEN_FAST_MATH,
4676 HasLog = 1,
4677 HasExp = 1,
4678 HasSqrt = 0,
4679 HasTanh = EIGEN_FAST_MATH,
4680 HasErf = EIGEN_FAST_MATH,
4681 HasBessel = 0, // Issues with accuracy.
4682 HasNdtri = 0
4683 };
4684};
4685
4686template <>
4687struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
4688
4689namespace detail {
4690template <>
4691EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
4692 const uint16x4x2_t tmp = vzip_u16(p1, p2);
4693 p1 = tmp.val[0];
4694 p2 = tmp.val[1];
4695}
4696} // namespace detail
4697
4698EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) {
4699 // See the scalar implementation in BFloat16.h for a comprehensible explanation
4700 // of this fast rounding algorithm
4701 Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
4702
4703 // lsb = (input >> 16) & 1
4704 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
4705
4706 // rounding_bias = 0x7fff + lsb
4707 Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
4708
4709 // input += rounding_bias
4710 input = vaddq_u32(input, rounding_bias);
4711
4712 // input = input >> 16
4713 input = vshrq_n_u32(input, 16);
4714
4715 // Replace float-nans by bfloat16-nans, that is 0x7fc0
4716 const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
4717 const Packet4ui mask = vceqq_f32(p, p);
4718 input = vbslq_u32(mask, input, bf16_nan);
4719
4720 // output = static_cast<uint16_t>(input)
4721 return vmovn_u32(input);
4722}
4723
4724EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) {
4725 return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
4726}
4727
4728EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
4729
4730template <>
4731EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
4732 return Packet4bf(pset1<Packet4us>(from.value));
4733}
4734
4735template <>
4736EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
4737 return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
4738}
4739
4740template <>
4741EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
4742 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4bf>::alignment);
4743 return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4744}
4745
4746template <>
4747EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
4748 return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4749}
4750
4751template <>
4752EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
4753 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4bf>::alignment);
4754 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
4755}
4756
4757template <>
4758EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from) {
4759 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
4760}
4761
4762template <>
4763EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from) {
4764 return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4765}
4766
4767template <>
4768EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
4769 return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
4770}
4771
4772template <>
4773EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4774 return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4775}
4776template <>
4777EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4778 return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4779}
4780
4781template <>
4782EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4783 return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4784}
4785
4786template <>
4787EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4788 return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4789}
4790template <>
4791EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4792 return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4793}
4794
4795template <>
4796EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4797 return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4798}
4799
4800template <>
4801EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a) {
4802 return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
4803}
4804
4805template <>
4806EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a, const Packet4bf& b) {
4807 return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
4808}
4809
4810template <>
4811EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a, const Packet4bf& b) {
4812 return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
4813}
4814
4815template <>
4816EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a, const Packet4bf& b) {
4817 return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
4818}
4819
4820template <>
4821EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a, const Packet4bf& b) {
4822 return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
4823}
4824
4825template <>
4826EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) {
4827 return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
4828}
4829
4830template <>
4831EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a) {
4832 return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
4833}
4834
4835template <>
4836EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a) {
4837 return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
4838}
4839
4840template <>
4841EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
4842 return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
4843}
4844
4845template <>
4846EIGEN_STRONG_INLINE Packet4bf pround<Packet4bf>(const Packet4bf& a) {
4847 return F32ToBf16(pround<Packet4f>(Bf16ToF32(a)));
4848}
4849
4850template <>
4851EIGEN_STRONG_INLINE Packet4bf ptrunc<Packet4bf>(const Packet4bf& a) {
4852 return F32ToBf16(ptrunc<Packet4f>(Bf16ToF32(a)));
4853}
4854
4855template <>
4856EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
4857 return a;
4858}
4859
4860template <>
4861EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4862 return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4863}
4864
4865template <>
4866EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4867 return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4868}
4869
4870template <>
4871EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4872 return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4873}
4874
4875template <>
4876EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4877 return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4878}
4879
4880template <>
4881EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4882 return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4883}
4884
4885template <>
4886EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4887 return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4888}
4889
4890template <>
4891EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4892 return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4893}
4894
4895template <>
4896EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4897 return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4898}
4899
4900template <>
4901EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride) {
4902 return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
4903}
4904
4905template <>
4906EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride) {
4907 pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
4908}
4909
4910template <>
4911EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a) {
4912 return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
4913}
4914
4915template <>
4916EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a) {
4917 return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
4918}
4919
4920template <>
4921EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a) {
4922 return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
4923}
4924
4925template <>
4926EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a) {
4927 return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
4928}
4929
4930template <>
4931EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a) {
4932 return Packet4bf(preverse<Packet4us>(Packet4us(a)));
4933}
4934
4935EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
4936 detail::ptranspose_impl(kernel);
4937}
4938
4939template <>
4940EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4941 return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4942}
4943
4944template <>
4945EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4946 return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4947}
4948
4949template <>
4950EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4951 return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4952}
4953
4954template <>
4955EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4956 return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4957}
4958
4959template <>
4960EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4961 return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4962}
4963
4964template <>
4965EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
4966 return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
4967}
4968
4969//---------- double ----------
4970
4971// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
4972// Confirmed at least with __apple_build_version__ = 6000054.
#if !EIGEN_COMP_CLANGAPPLE
// Not Apple Clang: the double-precision NEON workaround is never needed.
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#else
// Apple Clang: treat every build number below the 601* range as affected, in
// the hope that the bug is fixed by the time that range is reached.
// https://gist.github.com/yamaya/2924292 suggests that the first 3 digits are
// only updated with major toolchain updates.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
#endif
4981
4982#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
4983
4984#if EIGEN_COMP_GNUC
4985// Bug 907: workaround missing declarations of the following two functions in the ADK
4986// Defining these functions as templates ensures that if these intrinsics are
4987// already defined in arm_neon.h, then our workaround doesn't cause a conflict
4988// and has lower priority in overload resolution.
4989// This doesn't work with MSVC though, since the function names are macros.
4990template <typename T>
4991uint64x2_t vreinterpretq_u64_f64(T a) {
4992 return (uint64x2_t)a;
4993}
4994
4995template <typename T>
4996float64x2_t vreinterpretq_f64_u64(T a) {
4997 return (float64x2_t)a;
4998}
4999#endif
5000
5001#if EIGEN_COMP_MSVC_STRICT
5002typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
5003typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
5004
5005EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
5006 double from[2] = {a, b};
5007 return vld1q_f64(from);
5008}
5009
5010#else
5011typedef float64x2_t Packet2d;
5012typedef float64x1_t Packet1d;
5013
5014EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
5015#endif
5016
5017// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
5018// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
5019// for fast inversion of matrices of size 4.
5020EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
5021 const double* a = reinterpret_cast<const double*>(&m);
5022 const double* b = reinterpret_cast<const double*>(&n);
5023 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
5024 return res;
5025}
5026
5027EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
5028 return shuffle(a, b, mask);
5029}
5030EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
5031EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
5032#define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p))
5033
5034template <>
5035struct packet_traits<double> : default_packet_traits {
5036 typedef Packet2d type;
5037 typedef Packet2d half;
5038 enum {
5039 Vectorizable = 1,
5040 AlignedOnScalar = 1,
5041 size = 2,
5042
5043 HasCmp = 1,
5044 HasAdd = 1,
5045 HasSub = 1,
5046 HasShift = 1,
5047 HasMul = 1,
5048 HasNegate = 1,
5049 HasAbs = 1,
5050 HasArg = 0,
5051 HasAbs2 = 1,
5052 HasAbsDiff = 1,
5053 HasMin = 1,
5054 HasMax = 1,
5055 HasConj = 1,
5056 HasSetLinear = 1,
5057 HasBlend = 0,
5058
5059 HasDiv = 1,
5060
5061#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5062 HasExp = 1,
5063 HasLog = 1,
5064 HasPow = 1,
5065 HasATan = 1,
5066 HasATanh = 1,
5067#endif
5068 HasSin = EIGEN_FAST_MATH,
5069 HasCos = EIGEN_FAST_MATH,
5070 HasSqrt = 1,
5071 HasRsqrt = 1,
5072 HasCbrt = 1,
5073 HasTanh = EIGEN_FAST_MATH,
5074 HasErf = EIGEN_FAST_MATH,
5075 HasErfc = EIGEN_FAST_MATH
5076 };
5077};
5078
5079template <>
5080struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
5081 using integer_packet = Packet2l;
5082};
5083
5084template <>
5085EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(const Packet2d& /*a*/) {
5086 return vdupq_n_f64(0.0);
5087}
5088
5089template <>
5090EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
5091 return vdupq_n_f64(from);
5092}
5093
5094template <>
5095EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
5096 const double c[] = {0.0, 1.0};
5097 return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
5098}
5099
5100template <>
5101EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
5102 return vaddq_f64(a, b);
5103}
5104
5105template <>
5106EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
5107 return vsubq_f64(a, b);
5108}
5109
5110template <>
5111EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
5112template <>
5113EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
5114 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
5115 return padd(a, pxor(mask, b));
5116}
5117
5118template <>
5119EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
5120 return vnegq_f64(a);
5121}
5122
5123template <>
5124EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
5125 return a;
5126}
5127
5128template <>
5129EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
5130 return vmulq_f64(a, b);
5131}
5132
5133template <>
5134EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
5135 return vdivq_f64(a, b);
5136}
5137
5138#ifdef EIGEN_VECTORIZE_FMA
5139// See bug 936. See above comment about FMA for float.
5140template <>
5141EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5142 return vfmaq_f64(c, a, b);
5143}
5144template <>
5145EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5146 return vfmsq_f64(c, a, b);
5147}
5148#else
5149template <>
5150EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5151 return vmlaq_f64(c, a, b);
5152}
5153template <>
5154EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5155 return vmlsq_f64(c, a, b);
5156}
5157#endif
5158template <>
5159EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5160 return pnegate(pnmadd(a, b, c));
5161}
5162template <>
5163EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5164 return pnegate(pmadd(a, b, c));
5165}
5166template <>
5167EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
5168 return vminq_f64(a, b);
5169}
5170
5171#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
5172// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
5173// systems).
5174template <>
5175EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
5176 return vminnmq_f64(a, b);
5177}
5178template <>
5179EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
5180 return vmaxnmq_f64(a, b);
5181}
5182
5183#endif
5184
5185template <>
5186EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
5187 return pmin<Packet2d>(a, b);
5188}
5189
5190template <>
5191EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
5192 return vmaxq_f64(a, b);
5193}
5194
5195template <>
5196EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
5197 return pmax<Packet2d>(a, b);
5198}
5199
// Logical operations are not supported on floating-point vectors, so we reinterpret them as integer vectors using NEON intrinsics.
5201template <>
5202EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
5203 return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5204}
5205
5206template <>
5207EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
5208 return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5209}
5210
5211template <>
5212EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
5213 return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5214}
5215
5216template <>
5217EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
5218 return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5219}
5220
5221template <>
5222EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
5223 return vreinterpretq_f64_u64(vcleq_f64(a, b));
5224}
5225
5226template <>
5227EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
5228 return vreinterpretq_f64_u64(vcltq_f64(a, b));
5229}
5230
5231template <>
5232EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
5233 return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
5234}
5235
5236template <>
5237EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
5238 return vreinterpretq_f64_u64(vceqq_f64(a, b));
5239}
5240
5241template <>
5242EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
5243 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet2d>::alignment);
5244 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
5245}
5246
5247template <>
5248EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
5249 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
5250}
5251
5252template <>
5253EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
5254 return vld1q_dup_f64(from);
5255}
5256template <>
5257EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
5258 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet2d>::alignment);
5259 EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
5260}
5261
5262template <>
5263EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
5264 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
5265}
5266
5267template <>
5268EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
5269 Packet2d res = pset1<Packet2d>(0.0);
5270 res = vld1q_lane_f64(from + 0 * stride, res, 0);
5271 res = vld1q_lane_f64(from + 1 * stride, res, 1);
5272 return res;
5273}
5274
5275template <>
5276EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
5277 vst1q_lane_f64(to + stride * 0, from, 0);
5278 vst1q_lane_f64(to + stride * 1, from, 1);
5279}
5280
5281template <>
5282EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
5283 EIGEN_ARM_PREFETCH(addr);
5284}
5285
5286// FIXME only store the 2 first elements ?
5287template <>
5288EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
5289 return vgetq_lane_f64(a, 0);
5290}
5291
5292template <>
5293EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
5294 return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
5295}
5296
5297template <>
5298EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
5299 return vabsq_f64(a);
5300}
5301
5302template <>
5303EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
5304 return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
5305}
5306
5307template <>
5308EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
5309 return vaddvq_f64(a);
5310}
5311
5312// Other reduction functions:
5313// mul
5314#if EIGEN_COMP_CLANGAPPLE
5315template <>
5316EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
5317 return (vget_low_f64(a) * vget_high_f64(a))[0];
5318}
5319#else
5320template <>
5321EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
5322 return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
5323}
5324#endif
5325
5326// min
5327template <>
5328EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
5329 return vminvq_f64(a);
5330}
5331
5332// max
5333template <>
5334EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
5335 return vmaxvq_f64(a);
5336}
5337
5338EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
5339 const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
5340 const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
5341
5342 kernel.packet[0] = tmp1;
5343 kernel.packet[1] = tmp2;
5344}
5345
5346template <>
5347EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
5348 return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
5349}
5350
5351template <>
5352EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
5353 return vrndnq_f64(a);
5354}
5355
5356template <>
5357EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
5358 return vrndmq_f64(a);
5359}
5360
5361template <>
5362EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
5363 return vrndpq_f64(a);
5364}
5365
5366template <>
5367EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
5368 return vrndaq_f64(a);
5369}
5370
5371template <>
5372EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
5373 return vrndq_f64(a);
5374}
5375
5376template <>
5377EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
5378 return pldexp_generic(a, exponent);
5379}
5380
5381template <>
5382EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
5383 return pfrexp_generic(a, exponent);
5384}
5385
5386template <>
5387EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
5388 return vreinterpretq_f64_u64(vdupq_n_u64(from));
5389}
5390
5391template <>
5392EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
5393 // Do Newton iterations for 1/sqrt(x).
5394 return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
5395}
5396
5397template <>
5398EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x) {
5399 return vsqrtq_f64(_x);
5400}
5401
5402#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5403
// Do we have fp16 types and supporting Neon intrinsics?
5405#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
5406typedef float16x4_t Packet4hf;
5407typedef float16x8_t Packet8hf;
5408
5409template <>
5410struct packet_traits<Eigen::half> : default_packet_traits {
5411 typedef Packet8hf type;
5412 typedef Packet4hf half;
5413 enum {
5414 Vectorizable = 1,
5415 AlignedOnScalar = 1,
5416 size = 8,
5417
5418 HasCmp = 1,
5419 HasCast = 1,
5420 HasAdd = 1,
5421 HasSub = 1,
5422 HasShift = 1,
5423 HasMul = 1,
5424 HasNegate = 1,
5425 HasAbs = 1,
5426 HasArg = 0,
5427 HasAbs2 = 1,
5428 HasAbsDiff = 0,
5429 HasMin = 1,
5430 HasMax = 1,
5431 HasConj = 1,
5432 HasSetLinear = 1,
5433 HasBlend = 0,
5434 HasInsert = 1,
5435 HasReduxp = 1,
5436 HasDiv = 1,
5437 HasSin = 0,
5438 HasCos = 0,
5439 HasLog = 0,
5440 HasExp = 0,
5441 HasTanh = packet_traits<float>::HasTanh, // tanh<half> calls tanh<float>
5442 HasSqrt = 1,
5443 HasRsqrt = 1,
5444 HasErf = EIGEN_FAST_MATH,
5445 HasBessel = 0, // Issues with accuracy.
5446 HasNdtri = 0
5447 };
5448};
5449
5450template <>
5451struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
5452template <>
5453struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
5454 using half = Packet4hf;
5455};
5456
5457template <>
5458EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
5459 return vadd_f16(vget_low_f16(a), vget_high_f16(a));
5460}
5461
5462template <>
5463EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
5464 return vdupq_n_f16(from.x);
5465}
5466
5467template <>
5468EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
5469 return vdup_n_f16(from.x);
5470}
5471
5472template <>
5473EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
5474 const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
5475 Packet8hf countdown = vld1q_f16(f);
5476 return vaddq_f16(pset1<Packet8hf>(a), countdown);
5477}
5478
5479template <>
5480EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
5481 const float16_t f[] = {0, 1, 2, 3};
5482 Packet4hf countdown = vld1_f16(f);
5483 return vadd_f16(pset1<Packet4hf>(a), countdown);
5484}
5485
5486template <>
5487EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5488 return vaddq_f16(a, b);
5489}
5490
5491template <>
5492EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5493 return vadd_f16(a, b);
5494}
5495
5496template <>
5497EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5498 return vsubq_f16(a, b);
5499}
5500
5501template <>
5502EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5503 return vsub_f16(a, b);
5504}
5505
5506template <>
5507EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
5508 return vnegq_f16(a);
5509}
5510
5511template <>
5512EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
5513 return vneg_f16(a);
5514}
5515
5516template <>
5517EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
5518 return a;
5519}
5520
5521template <>
5522EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
5523 return a;
5524}
5525
5526template <>
5527EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5528 return vmulq_f16(a, b);
5529}
5530
5531template <>
5532EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5533 return vmul_f16(a, b);
5534}
5535
5536template <>
5537EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5538 return vdivq_f16(a, b);
5539}
5540
5541template <>
5542EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5543 return vdiv_f16(a, b);
5544}
5545
5546template <>
5547EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5548 return vfmaq_f16(c, a, b);
5549}
5550
5551template <>
5552EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5553 return vfma_f16(c, a, b);
5554}
5555
5556template <>
5557EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5558 return vfmsq_f16(c, a, b);
5559}
5560
5561template <>
5562EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5563 return vfms_f16(c, a, b);
5564}
5565
5566template <>
5567EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5568 return pnegate(pnmadd(a, b, c));
5569}
5570
5571template <>
5572EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5573 return pnegate(pnmadd(a, b, c));
5574}
5575
5576template <>
5577EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5578 return pnegate(pmadd(a, b, c));
5579}
5580
5581template <>
5582EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5583 return pnegate(pmadd(a, b, c));
5584}
5585
5586template <>
5587EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5588 return vminq_f16(a, b);
5589}
5590
5591template <>
5592EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5593 return vmin_f16(a, b);
5594}
5595
5596#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for
// Armv8 systems).
5599template <>
5600EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5601 return vminnm_f16(a, b);
5602}
5603template <>
5604EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5605 return vminnmq_f16(a, b);
5606}
5607#endif
5608
5609template <>
5610EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5611 return pmin<Packet4hf>(a, b);
5612}
5613
5614template <>
5615EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5616 return pmin<Packet8hf>(a, b);
5617}
5618
5619template <>
5620EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5621 return vmaxq_f16(a, b);
5622}
5623
5624template <>
5625EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5626 return vmax_f16(a, b);
5627}
5628
5629#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for
// Armv8 systems).
5632template <>
5633EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5634 return vmaxnm_f16(a, b);
5635}
5636template <>
5637EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5638 return vmaxnmq_f16(a, b);
5639}
5640#endif
5641
5642template <>
5643EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5644 return pmax<Packet4hf>(a, b);
5645}
5646
5647template <>
5648EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5649 return pmax<Packet8hf>(a, b);
5650}
5651
5652#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
5653 template <> \
5654 EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
5655 return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
5656 }
5657
5658#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
5659 template <> \
5660 EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
5661 return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
5662 }
5663
5664EIGEN_MAKE_ARM_FP16_CMP_8(eq)
5665EIGEN_MAKE_ARM_FP16_CMP_8(lt)
5666EIGEN_MAKE_ARM_FP16_CMP_8(le)
5667
5668EIGEN_MAKE_ARM_FP16_CMP_4(eq)
5669EIGEN_MAKE_ARM_FP16_CMP_4(lt)
5670EIGEN_MAKE_ARM_FP16_CMP_4(le)
5671
5672#undef EIGEN_MAKE_ARM_FP16_CMP_8
5673#undef EIGEN_MAKE_ARM_FP16_CMP_4
5674
5675template <>
5676EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5677 return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
5678}
5679
5680template <>
5681EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5682 return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
5683}
5684
5685template <>
5686EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
5687 return vrndnq_f16(a);
5688}
5689
5690template <>
5691EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
5692 return vrndn_f16(a);
5693}
5694
5695template <>
5696EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
5697 return vrndmq_f16(a);
5698}
5699
5700template <>
5701EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
5702 return vrndm_f16(a);
5703}
5704
5705template <>
5706EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
5707 return vrndpq_f16(a);
5708}
5709
5710template <>
5711EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
5712 return vrndp_f16(a);
5713}
5714
5715template <>
5716EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(const Packet8hf& a) {
5717 return vrndaq_f16(a);
5718}
5719
5720template <>
5721EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(const Packet4hf& a) {
5722 return vrnda_f16(a);
5723}
5724
5725template <>
5726EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(const Packet8hf& a) {
5727 return vrndq_f16(a);
5728}
5729
5730template <>
5731EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(const Packet4hf& a) {
5732 return vrnd_f16(a);
5733}
5734
5735template <>
5736EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
5737 return vsqrtq_f16(a);
5738}
5739
5740template <>
5741EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
5742 return vsqrt_f16(a);
5743}
5744
5745template <>
5746EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5747 return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5748}
5749
5750template <>
5751EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5752 return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5753}
5754
5755template <>
5756EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5757 return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5758}
5759
5760template <>
5761EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5762 return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5763}
5764
5765template <>
5766EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5767 return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5768}
5769
5770template <>
5771EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5772 return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5773}
5774
5775template <>
5776EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5777 return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5778}
5779
5780template <>
5781EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5782 return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5783}
5784
5785template <>
5786EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
5787 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet8hf>::alignment);
5788 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5789}
5790
5791template <>
5792EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
5793 EIGEN_ASSUME_ALIGNED(from, unpacket_traits<Packet4hf>::alignment);
5794 EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
5795}
5796
5797template <>
5798EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
5799 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5800}
5801
5802template <>
5803EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
5804 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
5805}
5806
5807template <>
5808EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
5809 Packet8hf packet;
5810 packet[0] = from[0].x;
5811 packet[1] = from[0].x;
5812 packet[2] = from[1].x;
5813 packet[3] = from[1].x;
5814 packet[4] = from[2].x;
5815 packet[5] = from[2].x;
5816 packet[6] = from[3].x;
5817 packet[7] = from[3].x;
5818 return packet;
5819}
5820
5821template <>
5822EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
5823 float16x4_t packet;
5824 float16_t* tmp;
5825 tmp = (float16_t*)&packet;
5826 tmp[0] = from[0].x;
5827 tmp[1] = from[0].x;
5828 tmp[2] = from[1].x;
5829 tmp[3] = from[1].x;
5830 return packet;
5831}
5832
5833template <>
5834EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
5835 Packet4hf lo, hi;
5836 lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
5837 hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
5838 return vcombine_f16(lo, hi);
5839}
5840
5841EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
5842 return vsetq_lane_f16(b.x, a, 0);
5843}
5844
5845EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
5846 return vset_lane_f16(b.x, a, 0);
5847}
5848
5849template <>
5850EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
5851 return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
5852}
5853
5854template <>
5855EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
5856 return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
5857}
5858
5859EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
5860 return vsetq_lane_f16(b.x, a, 7);
5861}
5862
5863EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
5864 return vset_lane_f16(b.x, a, 3);
5865}
5866
5867template <>
5868EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5869 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet8hf>::alignment);
5870 EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5871}
5872
5873template <>
5874EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5875 EIGEN_ASSUME_ALIGNED(to, unpacket_traits<Packet4hf>::alignment);
5876 EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5877}
5878
5879template <>
5880EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5881 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5882}
5883
5884template <>
5885EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5886 EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5887}
5888
5889template <>
5890EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
5891 Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
5892 res = vsetq_lane_f16(from[0 * stride].x, res, 0);
5893 res = vsetq_lane_f16(from[1 * stride].x, res, 1);
5894 res = vsetq_lane_f16(from[2 * stride].x, res, 2);
5895 res = vsetq_lane_f16(from[3 * stride].x, res, 3);
5896 res = vsetq_lane_f16(from[4 * stride].x, res, 4);
5897 res = vsetq_lane_f16(from[5 * stride].x, res, 5);
5898 res = vsetq_lane_f16(from[6 * stride].x, res, 6);
5899 res = vsetq_lane_f16(from[7 * stride].x, res, 7);
5900 return res;
5901}
5902
5903template <>
5904EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
5905 Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
5906 res = vset_lane_f16(from[0 * stride].x, res, 0);
5907 res = vset_lane_f16(from[1 * stride].x, res, 1);
5908 res = vset_lane_f16(from[2 * stride].x, res, 2);
5909 res = vset_lane_f16(from[3 * stride].x, res, 3);
5910 return res;
5911}
5912
5913template <>
5914EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
5915 Index stride) {
5916 to[stride * 0].x = vgetq_lane_f16(from, 0);
5917 to[stride * 1].x = vgetq_lane_f16(from, 1);
5918 to[stride * 2].x = vgetq_lane_f16(from, 2);
5919 to[stride * 3].x = vgetq_lane_f16(from, 3);
5920 to[stride * 4].x = vgetq_lane_f16(from, 4);
5921 to[stride * 5].x = vgetq_lane_f16(from, 5);
5922 to[stride * 6].x = vgetq_lane_f16(from, 6);
5923 to[stride * 7].x = vgetq_lane_f16(from, 7);
5924}
5925
5926template <>
5927EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
5928 Index stride) {
5929 to[stride * 0].x = vget_lane_f16(from, 0);
5930 to[stride * 1].x = vget_lane_f16(from, 1);
5931 to[stride * 2].x = vget_lane_f16(from, 2);
5932 to[stride * 3].x = vget_lane_f16(from, 3);
5933}
5934
5935template <>
5936EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
5937 EIGEN_ARM_PREFETCH(addr);
5938}
5939
5940template <>
5941EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
5942 float16_t x[8];
5943 vst1q_f16(x, a);
5944 Eigen::half h;
5945 h.x = x[0];
5946 return h;
5947}
5948
5949template <>
5950EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
5951 float16_t x[4];
5952 vst1_f16(x, a);
5953 Eigen::half h;
5954 h.x = x[0];
5955 return h;
5956}
5957
5958template <>
5959EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
5960 float16x4_t a_lo, a_hi;
5961 Packet8hf a_r64;
5962
5963 a_r64 = vrev64q_f16(a);
5964 a_lo = vget_low_f16(a_r64);
5965 a_hi = vget_high_f16(a_r64);
5966 return vcombine_f16(a_hi, a_lo);
5967}
5968
5969template <>
5970EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
5971 return vrev64_f16(a);
5972}
5973
5974template <>
5975EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
5976 return vabsq_f16(a);
5977}
5978
5979template <>
5980EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
5981 return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
5982}
5983
5984template <>
5985EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
5986 return vabs_f16(a);
5987}
5988
5989template <>
5990EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
5991 return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
5992}
5993
5994template <>
5995EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
5996 float16x4_t a_lo, a_hi, sum;
5997
5998 a_lo = vget_low_f16(a);
5999 a_hi = vget_high_f16(a);
6000 sum = vpadd_f16(a_lo, a_hi);
6001 sum = vpadd_f16(sum, sum);
6002 sum = vpadd_f16(sum, sum);
6003
6004 Eigen::half h;
6005 h.x = vget_lane_f16(sum, 0);
6006 return h;
6007}
6008
6009template <>
6010EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
6011 float16x4_t sum;
6012
6013 sum = vpadd_f16(a, a);
6014 sum = vpadd_f16(sum, sum);
6015 Eigen::half h;
6016 h.x = vget_lane_f16(sum, 0);
6017 return h;
6018}
6019
6020template <>
6021EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
6022 float16x4_t a_lo, a_hi, prod;
6023
6024 a_lo = vget_low_f16(a);
6025 a_hi = vget_high_f16(a);
6026 prod = vmul_f16(a_lo, a_hi);
6027 prod = vmul_f16(prod, vrev64_f16(prod));
6028
6029 Eigen::half h;
6030 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6031 return h;
6032}
6033
6034template <>
6035EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
6036 float16x4_t prod;
6037 prod = vmul_f16(a, vrev64_f16(a));
6038 Eigen::half h;
6039 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
6040 return h;
6041}
6042
6043template <>
6044EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
6045 Eigen::half h;
6046 h.x = vminvq_f16(a);
6047 return h;
6048}
6049
6050template <>
6051EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
6052 Eigen::half h;
6053 h.x = vminv_f16(a);
6054 return h;
6055}
6056
6057template <>
6058EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
6059 Eigen::half h;
6060 h.x = vmaxvq_f16(a);
6061 return h;
6062}
6063
6064template <>
6065EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
6066 Eigen::half h;
6067 h.x = vmaxv_f16(a);
6068 return h;
6069}
6070
6071EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
6072 const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
6073 const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
6074
6075 const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
6076 const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
6077
6078 kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
6079 kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
6080 kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
6081 kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
6082}
6083
6084EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
6085 EIGEN_ALIGN16 float16x4x4_t tmp_x4;
6086 float16_t* tmp = (float16_t*)&kernel;
6087 tmp_x4 = vld4_f16(tmp);
6088
6089 kernel.packet[0] = tmp_x4.val[0];
6090 kernel.packet[1] = tmp_x4.val[1];
6091 kernel.packet[2] = tmp_x4.val[2];
6092 kernel.packet[3] = tmp_x4.val[3];
6093}
6094
6095EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
6096 float16x8x2_t T_1[4];
6097
6098 T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
6099 T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
6100 T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
6101 T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
6102
6103 float16x8x2_t T_2[4];
6104 T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
6105 T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
6106 T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
6107 T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
6108
6109 float16x8x2_t T_3[4];
6110 T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
6111 T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
6112 T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
6113 T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
6114
6115 kernel.packet[0] = T_3[0].val[0];
6116 kernel.packet[1] = T_3[2].val[0];
6117 kernel.packet[2] = T_3[1].val[0];
6118 kernel.packet[3] = T_3[3].val[0];
6119 kernel.packet[4] = T_3[0].val[1];
6120 kernel.packet[5] = T_3[2].val[1];
6121 kernel.packet[6] = T_3[1].val[1];
6122 kernel.packet[7] = T_3[3].val[1];
6123}
6124#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
6125
6126} // end namespace internal
6127
6128} // end namespace Eigen
6129
6130#endif // EIGEN_PACKET_MATH_NEON_H
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82