Eigen  5.0.1-dev+7c7d8473
 
Loading...
Searching...
No Matches
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
6// Heavily based on Gael's SSE version.
7//
8// This Source Code Form is subject to the terms of the Mozilla
9// Public License v. 2.0. If a copy of the MPL was not distributed
10// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11
12#ifndef EIGEN_PACKET_MATH_NEON_H
13#define EIGEN_PACKET_MATH_NEON_H
14
15// IWYU pragma: private
16#include "../../InternalHeaderCheck.h"
17
18namespace Eigen {
19
20namespace internal {
21
// Architecture defaults for the NEON backend. Each macro is only defined here
// when it has not already been defined earlier in the translation unit.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// 32 registers are assumed when EIGEN_ARCH_ARM64 is set, 16 otherwise.
#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
37
38#if EIGEN_COMP_MSVC_STRICT
39
40// In MSVC's arm_neon.h header file, all NEON vector types
41// are aliases to the same underlying type __n128.
42// We thus have to wrap them to make them different C++ types.
43// (See also bug 1428)
44typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
45typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
46typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
47typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
48typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
49typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
50typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
51typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
52typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
53typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
54typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
55typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
56typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
57typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
58typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
59typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
60typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
61typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
62
63EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
64 float from[4] = {a, b, c, d};
65 return vld1q_f32(from);
66}
67
68EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
69 float from[2] = {a, b};
70 return vld1_f32(from);
71}
72
73#else
74
75typedef float32x2_t Packet2f;
76typedef float32x4_t Packet4f;
77typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
78typedef int8x8_t Packet8c;
79typedef int8x16_t Packet16c;
80typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
81typedef uint8x8_t Packet8uc;
82typedef uint8x16_t Packet16uc;
83typedef int16x4_t Packet4s;
84typedef int16x8_t Packet8s;
85typedef uint16x4_t Packet4us;
86typedef uint16x8_t Packet8us;
87typedef int32x2_t Packet2i;
88typedef int32x4_t Packet4i;
89typedef uint32x2_t Packet2ui;
90typedef uint32x4_t Packet4ui;
91typedef int64x2_t Packet2l;
92typedef uint64x2_t Packet2ul;
93
94EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
95EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
96
97#endif // EIGEN_COMP_MSVC_STRICT
98
99EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
100 const float* a = reinterpret_cast<const float*>(&m);
101 Packet4f res =
102 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
103 return res;
104}
105
106// functionally equivalent to _mm_shuffle_ps in SSE when interleave
107// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
108// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
109// to enable a shared implementation for fast inversion of matrices of size 4.
110template <bool interleave>
111EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
112 const float* a = reinterpret_cast<const float*>(&m);
113 const float* b = reinterpret_cast<const float*>(&n);
114 Packet4f res =
115 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
116 return res;
117}
118
119template <>
120EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
121 const float* a = reinterpret_cast<const float*>(&m);
122 const float* b = reinterpret_cast<const float*>(&n);
123 Packet4f res =
124 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
125 return res;
126}
127
128EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
129 return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
130}
131
132EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
133 return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
134}
135EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
136 return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
137}
138EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
139 return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
140}
141EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
142 return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
143}
144EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
145 return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
146}
147EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
148 return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
149}
// Broadcasts lane `p` of the Packet4f `a` to all four lanes of the result.
// `p` must be an integer constant expression in [0, 3]: lanes 0-1 are taken
// from the low half of `a`, lanes 2-3 from the high half (the previous
// definition only read the low half, so only p = 0 or 1 was usable).
// Both macro arguments are parenthesized so that expressions can be passed.
#define vec4f_duplane(a, p) \
  Packet4f(vdupq_lane_f32(((p) < 2) ? vget_low_f32((a)) : vget_high_f32((a)), (p) & 1))
151
// Declares `const Packet4f p4f_NAME` with every lane set to the float X.
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)

// Declares `const Packet4f p4f_NAME` with every lane set to the 32-bit pattern X.
// The previous expansion fed pset1<int32_t>(X), a scalar, to
// vreinterpretq_f32_u32, which expects a uint32x4_t. Going through
// pset1frombits broadcasts the bits into a vector first.
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(static_cast<uint32_t>(X))

// Declares `const Packet4i p4i_NAME` with every lane set to the integer X.
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
158
159#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
160// __builtin_prefetch tends to do nothing on ARM64 compilers because the
161// prefetch instructions there are too detailed for __builtin_prefetch to map
162// meaningfully to them.
163#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
164#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
165#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
166#elif defined __pld
167#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
168#elif EIGEN_ARCH_ARM
169#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
170#else
171// by default no explicit prefetching
172#define EIGEN_ARM_PREFETCH(ADDR)
173#endif
174
175template <>
176struct packet_traits<float> : default_packet_traits {
177 typedef Packet4f type;
178 typedef Packet2f half;
179 enum {
180 Vectorizable = 1,
181 AlignedOnScalar = 1,
182 size = 4,
183
184 HasCmp = 1,
185 HasAdd = 1,
186 HasSub = 1,
187 HasShift = 1,
188 HasMul = 1,
189 HasNegate = 1,
190 HasAbs = 1,
191 HasArg = 0,
192 HasAbsDiff = 1,
193 HasMin = 1,
194 HasMax = 1,
195 HasConj = 1,
196 HasSetLinear = 1,
197 HasDiv = 1,
198 HasSin = EIGEN_FAST_MATH,
199 HasCos = EIGEN_FAST_MATH,
200 HasACos = 1,
201 HasASin = 1,
202 HasATan = 1,
203 HasATanh = 1,
204 HasLog = 1,
205 HasExp = 1,
206 HasLog1p = 1,
207 HasExpm1 = 1,
208 HasPow = 1,
209 HasSqrt = 1,
210 HasRsqrt = 1,
211 HasCbrt = 1,
212 HasTanh = EIGEN_FAST_MATH,
213 HasErf = EIGEN_FAST_MATH,
214 HasErfc = EIGEN_FAST_MATH,
215 HasBessel = 0, // Issues with accuracy.
216 HasNdtri = 0
217 };
218};
219
220template <>
221struct packet_traits<int8_t> : default_packet_traits {
222 typedef Packet16c type;
223 typedef Packet8c half;
224 enum {
225 Vectorizable = 1,
226 AlignedOnScalar = 1,
227 size = 16,
228
229 HasCmp = 1,
230 HasAdd = 1,
231 HasSub = 1,
232 HasShift = 1,
233 HasMul = 1,
234 HasNegate = 1,
235 HasAbs = 1,
236 HasAbsDiff = 1,
237 HasArg = 0,
238 HasMin = 1,
239 HasMax = 1,
240 HasConj = 1,
241 HasSetLinear = 1,
242 };
243};
244
245template <>
246struct packet_traits<uint8_t> : default_packet_traits {
247 typedef Packet16uc type;
248 typedef Packet8uc half;
249 enum {
250 Vectorizable = 1,
251 AlignedOnScalar = 1,
252 size = 16,
253
254 HasCmp = 1,
255 HasAdd = 1,
256 HasSub = 1,
257 HasShift = 1,
258 HasMul = 1,
259 HasNegate = 0,
260 HasAbs = 1,
261 HasAbsDiff = 1,
262 HasArg = 0,
263 HasMin = 1,
264 HasMax = 1,
265 HasConj = 1,
266 HasSetLinear = 1,
267
268 HasSqrt = 1
269 };
270};
271
272template <>
273struct packet_traits<int16_t> : default_packet_traits {
274 typedef Packet8s type;
275 typedef Packet4s half;
276 enum {
277 Vectorizable = 1,
278 AlignedOnScalar = 1,
279 size = 8,
280
281 HasCmp = 1,
282 HasAdd = 1,
283 HasSub = 1,
284 HasShift = 1,
285 HasMul = 1,
286 HasNegate = 1,
287 HasAbs = 1,
288 HasAbsDiff = 1,
289 HasArg = 0,
290 HasMin = 1,
291 HasMax = 1,
292 HasConj = 1,
293 HasSetLinear = 1,
294 };
295};
296
297template <>
298struct packet_traits<uint16_t> : default_packet_traits {
299 typedef Packet8us type;
300 typedef Packet4us half;
301 enum {
302 Vectorizable = 1,
303 AlignedOnScalar = 1,
304 size = 8,
305
306 HasCmp = 1,
307 HasAdd = 1,
308 HasSub = 1,
309 HasShift = 1,
310 HasMul = 1,
311 HasNegate = 0,
312 HasAbs = 1,
313 HasAbsDiff = 1,
314 HasArg = 0,
315 HasMin = 1,
316 HasMax = 1,
317 HasConj = 1,
318 HasSetLinear = 1,
319 HasSqrt = 1
320 };
321};
322
323template <>
324struct packet_traits<int32_t> : default_packet_traits {
325 typedef Packet4i type;
326 typedef Packet2i half;
327 enum {
328 Vectorizable = 1,
329 AlignedOnScalar = 1,
330 size = 4,
331
332 HasCmp = 1,
333 HasAdd = 1,
334 HasSub = 1,
335 HasShift = 1,
336 HasMul = 1,
337 HasNegate = 1,
338 HasAbs = 1,
339 HasArg = 0,
340 HasAbsDiff = 1,
341 HasMin = 1,
342 HasMax = 1,
343 HasConj = 1,
344 HasSetLinear = 1,
345 };
346};
347
348template <>
349struct packet_traits<uint32_t> : default_packet_traits {
350 typedef Packet4ui type;
351 typedef Packet2ui half;
352 enum {
353 Vectorizable = 1,
354 AlignedOnScalar = 1,
355 size = 4,
356
357 HasCmp = 1,
358 HasAdd = 1,
359 HasSub = 1,
360 HasShift = 1,
361 HasMul = 1,
362 HasNegate = 0,
363 HasAbs = 1,
364 HasArg = 0,
365 HasAbsDiff = 1,
366 HasMin = 1,
367 HasMax = 1,
368 HasConj = 1,
369 HasSetLinear = 1,
370
371 HasSqrt = 1
372 };
373};
374
375template <>
376struct packet_traits<int64_t> : default_packet_traits {
377 typedef Packet2l type;
378 typedef Packet2l half;
379 enum {
380 Vectorizable = 1,
381 AlignedOnScalar = 1,
382 size = 2,
383
384 HasCmp = 1,
385 HasAdd = 1,
386 HasSub = 1,
387 HasShift = 1,
388 HasMul = 1,
389 HasNegate = 1,
390 HasAbs = 1,
391 HasArg = 0,
392 HasAbsDiff = 1,
393 HasMin = 1,
394 HasMax = 1,
395 HasConj = 1,
396 HasSetLinear = 1,
397 };
398};
399
400template <>
401struct packet_traits<uint64_t> : default_packet_traits {
402 typedef Packet2ul type;
403 typedef Packet2ul half;
404 enum {
405 Vectorizable = 1,
406 AlignedOnScalar = 1,
407 size = 2,
408
409 HasCmp = 1,
410 HasAdd = 1,
411 HasSub = 1,
412 HasShift = 1,
413 HasMul = 1,
414 HasNegate = 0,
415 HasAbs = 1,
416 HasArg = 0,
417 HasAbsDiff = 1,
418 HasMin = 1,
419 HasMax = 1,
420 HasConj = 1,
421 HasSetLinear = 1,
422 };
423};
424
// Shared defaults for the NEON unpacket_traits specializations.
//   type      - scalar element type of the packet
//   half      - half-size packet; defaults to the packet itself
//   size      - lane count, derived from the two type sizes
//   alignment - taken to be the byte size of the packet
// Masked loads and stores are reported as unavailable.
template <typename Packet, typename Scalar>
struct neon_unpacket_default {
  using type = Scalar;
  using half = Packet;

  static constexpr int size = sizeof(Packet) / sizeof(Scalar);
  static constexpr int alignment = sizeof(Packet);

  static constexpr bool vectorizable = true;
  static constexpr bool masked_load_available = false;
  static constexpr bool masked_store_available = false;
};
435
436template <>
437struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
438 using integer_packet = Packet2i;
439};
440template <>
441struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
442 using half = Packet2f;
443 using integer_packet = Packet4i;
444};
445template <>
446struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
447template <>
448struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
449 using half = Packet4c;
450};
451template <>
452struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
453 using half = Packet8c;
454};
455template <>
456struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
457template <>
458struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
459 using half = Packet4uc;
460};
461template <>
462struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
463 using half = Packet8uc;
464};
465template <>
466struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
467template <>
468struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
469 using half = Packet4s;
470};
471template <>
472struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
473template <>
474struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
475 using half = Packet4us;
476};
477template <>
478struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
479template <>
480struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
481 using half = Packet2i;
482};
483template <>
484struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
485template <>
486struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
487 using half = Packet2ui;
488};
489template <>
490struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
491template <>
492struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
493
494template <>
495EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
496 return vdup_n_f32(0.0f);
497}
498
499template <>
500EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
501 return vdupq_n_f32(0.0f);
502}
503
504template <>
505EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
506 return vdup_n_f32(from);
507}
508template <>
509EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
510 return vdupq_n_f32(from);
511}
512template <>
513EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from) {
514 return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
515}
516template <>
517EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) {
518 return vdup_n_s8(from);
519}
520template <>
521EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
522 return vdupq_n_s8(from);
523}
524template <>
525EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from) {
526 return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
527}
528template <>
529EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) {
530 return vdup_n_u8(from);
531}
532template <>
533EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
534 return vdupq_n_u8(from);
535}
536template <>
537EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) {
538 return vdup_n_s16(from);
539}
540template <>
541EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
542 return vdupq_n_s16(from);
543}
544template <>
545EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) {
546 return vdup_n_u16(from);
547}
548template <>
549EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
550 return vdupq_n_u16(from);
551}
552template <>
553EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) {
554 return vdup_n_s32(from);
555}
556template <>
557EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
558 return vdupq_n_s32(from);
559}
560template <>
561EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) {
562 return vdup_n_u32(from);
563}
564template <>
565EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
566 return vdupq_n_u32(from);
567}
568template <>
569EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
570 return vdupq_n_s64(from);
571}
572template <>
573EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
574 return vdupq_n_u64(from);
575}
576
577template <>
578EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
579 return vreinterpret_f32_u32(vdup_n_u32(from));
580}
581template <>
582EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
583 return vreinterpretq_f32_u32(vdupq_n_u32(from));
584}
585
586template <>
587EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a) {
588 const float c[] = {0.0f, 1.0f};
589 return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
590}
591template <>
592EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
593 const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
594 return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
595}
596template <>
597EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a) {
598 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
599}
600template <>
601EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a) {
602 const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
603 return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
604}
605template <>
606EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
607 const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
608 return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
609}
610template <>
611EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a) {
612 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
613}
614template <>
615EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a) {
616 const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
617 return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
618}
619template <>
620EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
621 const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
622 return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
623}
624template <>
625EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a) {
626 const int16_t c[] = {0, 1, 2, 3};
627 return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
628}
629template <>
630EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a) {
631 const uint16_t c[] = {0, 1, 2, 3};
632 return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
633}
634template <>
635EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
636 const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
637 return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
638}
639template <>
640EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
641 const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
642 return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
643}
644template <>
645EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a) {
646 const int32_t c[] = {0, 1};
647 return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
648}
649template <>
650EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
651 const int32_t c[] = {0, 1, 2, 3};
652 return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
653}
654template <>
655EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a) {
656 const uint32_t c[] = {0, 1};
657 return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
658}
659template <>
660EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
661 const uint32_t c[] = {0, 1, 2, 3};
662 return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
663}
664template <>
665EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
666 const int64_t c[] = {0, 1};
667 return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
668}
669template <>
670EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
671 const uint64_t c[] = {0, 1};
672 return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
673}
674
675template <>
676EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) {
677 return vadd_f32(a, b);
678}
679template <>
680EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
681 return vaddq_f32(a, b);
682}
683template <>
684EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b) {
685 return vget_lane_s32(
686 vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
687}
688template <>
689EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) {
690 return vadd_s8(a, b);
691}
692template <>
693EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
694 return vaddq_s8(a, b);
695}
696template <>
697EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
698 return vget_lane_u32(
699 vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
700}
701template <>
702EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
703 return vadd_u8(a, b);
704}
705template <>
706EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
707 return vaddq_u8(a, b);
708}
709template <>
710EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) {
711 return vadd_s16(a, b);
712}
713template <>
714EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
715 return vaddq_s16(a, b);
716}
717template <>
718EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) {
719 return vadd_u16(a, b);
720}
721template <>
722EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
723 return vaddq_u16(a, b);
724}
725template <>
726EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) {
727 return vadd_s32(a, b);
728}
729template <>
730EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
731 return vaddq_s32(a, b);
732}
733template <>
734EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
735 return vadd_u32(a, b);
736}
737template <>
738EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
739 return vaddq_u32(a, b);
740}
741template <>
742EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
743 return vaddq_s64(a, b);
744}
745template <>
746EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
747 return vaddq_u64(a, b);
748}
749
750template <>
751EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) {
752 return vsub_f32(a, b);
753}
754template <>
755EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
756 return vsubq_f32(a, b);
757}
758template <>
759EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b) {
760 return vget_lane_s32(
761 vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
762}
763template <>
764EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) {
765 return vsub_s8(a, b);
766}
767template <>
768EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
769 return vsubq_s8(a, b);
770}
771template <>
772EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
773 return vget_lane_u32(
774 vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
775}
776template <>
777EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
778 return vsub_u8(a, b);
779}
780template <>
781EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
782 return vsubq_u8(a, b);
783}
784template <>
785EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) {
786 return vsub_s16(a, b);
787}
788template <>
789EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
790 return vsubq_s16(a, b);
791}
792template <>
793EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) {
794 return vsub_u16(a, b);
795}
796template <>
797EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
798 return vsubq_u16(a, b);
799}
800template <>
801EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) {
802 return vsub_s32(a, b);
803}
804template <>
805EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
806 return vsubq_s32(a, b);
807}
808template <>
809EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
810 return vsub_u32(a, b);
811}
812template <>
813EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
814 return vsubq_u32(a, b);
815}
816template <>
817EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
818 return vsubq_s64(a, b);
819}
820template <>
821EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
822 return vsubq_u64(a, b);
823}
824
825template <>
826EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
827template <>
828EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b) {
829 Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
830 return padd(a, pxor(mask, b));
831}
832template <>
833EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
834template <>
835EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
836 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
837 return padd(a, pxor(mask, b));
838}
839
840template <>
841EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) {
842 return vneg_f32(a);
843}
844template <>
845EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
846 return vnegq_f32(a);
847}
848template <>
849EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) {
850 return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
851}
852template <>
853EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) {
854 return vneg_s8(a);
855}
856template <>
857EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
858 return vnegq_s8(a);
859}
860template <>
861EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) {
862 return vneg_s16(a);
863}
864template <>
865EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
866 return vnegq_s16(a);
867}
868template <>
869EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) {
870 return vneg_s32(a);
871}
872template <>
873EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
874 return vnegq_s32(a);
875}
876template <>
877EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
878#if EIGEN_ARCH_ARM64
879 return vnegq_s64(a);
880#else
881 return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
882#endif
883}
884
885template <>
886EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) {
887 return a;
888}
889template <>
890EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
891 return a;
892}
893template <>
894EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) {
895 return a;
896}
897template <>
898EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) {
899 return a;
900}
901template <>
902EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
903 return a;
904}
905template <>
906EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) {
907 return a;
908}
909template <>
910EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) {
911 return a;
912}
913template <>
914EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
915 return a;
916}
917template <>
918EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) {
919 return a;
920}
921template <>
922EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
923 return a;
924}
925template <>
926EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) {
927 return a;
928}
929template <>
930EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
931 return a;
932}
933template <>
934EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) {
935 return a;
936}
937template <>
938EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
939 return a;
940}
941template <>
942EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) {
943 return a;
944}
945template <>
946EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
947 return a;
948}
949template <>
950EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
951 return a;
952}
953template <>
954EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
955 return a;
956}
957
958template <>
959EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) {
960 return vmul_f32(a, b);
961}
962template <>
963EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
964 return vmulq_f32(a, b);
965}
966template <>
967EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b) {
968 return vget_lane_s32(
969 vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
970}
971template <>
972EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) {
973 return vmul_s8(a, b);
974}
975template <>
976EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
977 return vmulq_s8(a, b);
978}
979template <>
980EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
981 return vget_lane_u32(
982 vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
983}
984template <>
985EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
986 return vmul_u8(a, b);
987}
988template <>
989EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
990 return vmulq_u8(a, b);
991}
992template <>
993EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) {
994 return vmul_s16(a, b);
995}
996template <>
997EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
998 return vmulq_s16(a, b);
999}
1000template <>
1001EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) {
1002 return vmul_u16(a, b);
1003}
1004template <>
1005EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1006 return vmulq_u16(a, b);
1007}
1008template <>
1009EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) {
1010 return vmul_s32(a, b);
1011}
1012template <>
1013EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1014 return vmulq_s32(a, b);
1015}
1016template <>
1017EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1018 return vmul_u32(a, b);
1019}
1020template <>
1021EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1022 return vmulq_u32(a, b);
1023}
1024template <>
1025EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
1026 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
1027 vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
1028}
1029template <>
1030EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1031 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
1032 vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
1033}
1034
1035template <>
1036EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/) {
1037 eigen_assert(false && "packet integer division are not supported by NEON");
1038 return pset1<Packet4c>(0);
1039}
1040template <>
1041EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/) {
1042 eigen_assert(false && "packet integer division are not supported by NEON");
1043 return pset1<Packet8c>(0);
1044}
1045template <>
1046EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/) {
1047 eigen_assert(false && "packet integer division are not supported by NEON");
1048 return pset1<Packet16c>(0);
1049}
1050template <>
1051EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/) {
1052 eigen_assert(false && "packet integer division are not supported by NEON");
1053 return pset1<Packet4uc>(0);
1054}
1055template <>
1056EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/) {
1057 eigen_assert(false && "packet integer division are not supported by NEON");
1058 return pset1<Packet8uc>(0);
1059}
1060template <>
1061EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/) {
1062 eigen_assert(false && "packet integer division are not supported by NEON");
1063 return pset1<Packet16uc>(0);
1064}
1065template <>
1066EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/) {
1067 eigen_assert(false && "packet integer division are not supported by NEON");
1068 return pset1<Packet4s>(0);
1069}
1070template <>
1071EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
1072 eigen_assert(false && "packet integer division are not supported by NEON");
1073 return pset1<Packet8s>(0);
1074}
1075template <>
1076EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/) {
1077 eigen_assert(false && "packet integer division are not supported by NEON");
1078 return pset1<Packet4us>(0);
1079}
1080template <>
1081EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
1082 eigen_assert(false && "packet integer division are not supported by NEON");
1083 return pset1<Packet8us>(0);
1084}
1085template <>
1086EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/) {
1087 eigen_assert(false && "packet integer division are not supported by NEON");
1088 return pset1<Packet2i>(0);
1089}
1090template <>
1091EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
1092 eigen_assert(false && "packet integer division are not supported by NEON");
1093 return pset1<Packet4i>(0);
1094}
1095template <>
1096EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/) {
1097 eigen_assert(false && "packet integer division are not supported by NEON");
1098 return pset1<Packet2ui>(0);
1099}
1100template <>
1101EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
1102 eigen_assert(false && "packet integer division are not supported by NEON");
1103 return pset1<Packet4ui>(0);
1104}
1105template <>
1106EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
1107 eigen_assert(false && "packet integer division are not supported by NEON");
1108 return pset1<Packet2l>(0LL);
1109}
1110template <>
1111EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
1112 eigen_assert(false && "packet integer division are not supported by NEON");
1113 return pset1<Packet2ul>(0ULL);
1114}
1115
1116#ifdef EIGEN_VECTORIZE_FMA
1117template <>
1118EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1119 return vfmaq_f32(c, a, b);
1120}
1121template <>
1122EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1123 return vfma_f32(c, a, b);
1124}
1125template <>
1126EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1127 return vfmsq_f32(c, a, b);
1128}
1129template <>
1130EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1131 return vfms_f32(c, a, b);
1132}
1133#else
1134template <>
1135EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1136 return vmlaq_f32(c, a, b);
1137}
1138template <>
1139EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1140 return vmla_f32(c, a, b);
1141}
1142template <>
1143EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1144 return vmlsq_f32(c, a, b);
1145}
1146template <>
1147EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1148 return vmls_f32(c, a, b);
1149}
1150#endif
1151template <>
1152EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1153 return pnegate(pnmadd(a, b, c));
1154}
1155template <>
1156EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1157 return pnegate(pnmadd(a, b, c));
1158}
1159template <>
1160EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1161 return pnegate(pmadd(a, b, c));
1162}
1163template <>
1164EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
1165 return pnegate(pmadd(a, b, c));
1166}
1167
1168// No FMA instruction for int, so use MLA unconditionally.
1169template <>
1170EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) {
1171 return vget_lane_s32(
1172 vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
1173 vreinterpret_s8_s32(vdup_n_s32(b)))),
1174 0);
1175}
1176template <>
1177EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) {
1178 return vmla_s8(c, a, b);
1179}
1180template <>
1181EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
1182 return vmlaq_s8(c, a, b);
1183}
1184template <>
1185EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) {
1186 return vget_lane_u32(
1187 vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
1188 vreinterpret_u8_u32(vdup_n_u32(b)))),
1189 0);
1190}
1191template <>
1192EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) {
1193 return vmla_u8(c, a, b);
1194}
1195template <>
1196EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
1197 return vmlaq_u8(c, a, b);
1198}
1199template <>
1200EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) {
1201 return vmla_s16(c, a, b);
1202}
1203template <>
1204EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1205 return vmlaq_s16(c, a, b);
1206}
1207template <>
1208EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) {
1209 return vmla_u16(c, a, b);
1210}
1211template <>
1212EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1213 return vmlaq_u16(c, a, b);
1214}
1215template <>
1216EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) {
1217 return vmla_s32(c, a, b);
1218}
1219template <>
1220EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1221 return vmlaq_s32(c, a, b);
1222}
1223template <>
1224EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) {
1225 return vmla_u32(c, a, b);
1226}
1227template <>
1228EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
1229 return vmlaq_u32(c, a, b);
1230}
1231
1232template <>
1233EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b) {
1234 return vabd_f32(a, b);
1235}
1236template <>
1237EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
1238 return vabdq_f32(a, b);
1239}
1240template <>
1241EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b) {
1242 return vget_lane_s32(
1243 vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1244}
1245template <>
1246EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b) {
1247 return vabd_s8(a, b);
1248}
1249template <>
1250EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
1251 return vabdq_s8(a, b);
1252}
1253template <>
1254EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1255 return vget_lane_u32(
1256 vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1257}
1258template <>
1259EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1260 return vabd_u8(a, b);
1261}
1262template <>
1263EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1264 return vabdq_u8(a, b);
1265}
1266template <>
1267EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b) {
1268 return vabd_s16(a, b);
1269}
1270template <>
1271EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
1272 return vabdq_s16(a, b);
1273}
1274template <>
1275EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b) {
1276 return vabd_u16(a, b);
1277}
1278template <>
1279EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
1280 return vabdq_u16(a, b);
1281}
1282template <>
1283EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b) {
1284 return vabd_s32(a, b);
1285}
1286template <>
1287EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
1288 return vabdq_s32(a, b);
1289}
1290template <>
1291EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1292 return vabd_u32(a, b);
1293}
1294template <>
1295EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1296 return vabdq_u32(a, b);
1297}
1298
1299template <>
1300EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) {
1301 return vmin_f32(a, b);
1302}
1303template <>
1304EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1305 return vminq_f32(a, b);
1306}
1307
1308#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1309// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
1310// systems).
1311template <>
1312EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1313 return vminnmq_f32(a, b);
1314}
1315template <>
1316EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
1317 return vminnm_f32(a, b);
1318}
1319#endif
1320
1321template <>
1322EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1323 return pmin<Packet4f>(a, b);
1324}
1325
1326template <>
1327EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
1328 return pmin<Packet2f>(a, b);
1329}
1330
1331template <>
1332EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b) {
1333 return vget_lane_s32(
1334 vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1335}
1336template <>
1337EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) {
1338 return vmin_s8(a, b);
1339}
1340template <>
1341EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1342 return vminq_s8(a, b);
1343}
1344template <>
1345EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1346 return vget_lane_u32(
1347 vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1348}
1349template <>
1350EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1351 return vmin_u8(a, b);
1352}
1353template <>
1354EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1355 return vminq_u8(a, b);
1356}
1357template <>
1358EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) {
1359 return vmin_s16(a, b);
1360}
1361template <>
1362EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1363 return vminq_s16(a, b);
1364}
1365template <>
1366EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) {
1367 return vmin_u16(a, b);
1368}
1369template <>
1370EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1371 return vminq_u16(a, b);
1372}
1373template <>
1374EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) {
1375 return vmin_s32(a, b);
1376}
1377template <>
1378EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1379 return vminq_s32(a, b);
1380}
1381template <>
1382EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1383 return vmin_u32(a, b);
1384}
1385template <>
1386EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1387 return vminq_u32(a, b);
1388}
1389template <>
1390EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1391 return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1392 vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1393}
1394template <>
1395EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1396 return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1397 vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1398}
1399
1400template <>
1401EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) {
1402 return vmax_f32(a, b);
1403}
1404template <>
1405EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1406 return vmaxq_f32(a, b);
1407}
1408
1409#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
1410// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
1411// systems).
1412template <>
1413EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1414 return vmaxnmq_f32(a, b);
1415}
1416template <>
1417EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
1418 return vmaxnm_f32(a, b);
1419}
1420#endif
1421
1422template <>
1423EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1424 return pmax<Packet4f>(a, b);
1425}
1426
1427template <>
1428EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
1429 return pmax<Packet2f>(a, b);
1430}
1431
1432template <>
1433EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b) {
1434 return vget_lane_s32(
1435 vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1436}
1437template <>
1438EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) {
1439 return vmax_s8(a, b);
1440}
1441template <>
1442EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1443 return vmaxq_s8(a, b);
1444}
1445template <>
1446EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1447 return vget_lane_u32(
1448 vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1449}
1450template <>
1451EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1452 return vmax_u8(a, b);
1453}
1454template <>
1455EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1456 return vmaxq_u8(a, b);
1457}
1458template <>
1459EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) {
1460 return vmax_s16(a, b);
1461}
1462template <>
1463EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1464 return vmaxq_s16(a, b);
1465}
1466template <>
1467EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) {
1468 return vmax_u16(a, b);
1469}
1470template <>
1471EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1472 return vmaxq_u16(a, b);
1473}
1474template <>
1475EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) {
1476 return vmax_s32(a, b);
1477}
1478template <>
1479EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1480 return vmaxq_s32(a, b);
1481}
1482template <>
1483EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1484 return vmax_u32(a, b);
1485}
1486template <>
1487EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1488 return vmaxq_u32(a, b);
1489}
1490template <>
1491EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1492 return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
1493 vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
1494}
1495template <>
1496EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1497 return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
1498 vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
1499}
1500
1501template <>
1502EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b) {
1503 return vreinterpret_f32_u32(vcle_f32(a, b));
1504}
1505template <>
1506EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1507 return vreinterpretq_f32_u32(vcleq_f32(a, b));
1508}
1509template <>
1510EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b) {
1511 return vget_lane_s32(
1512 vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1513}
1514template <>
1515EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b) {
1516 return vreinterpret_s8_u8(vcle_s8(a, b));
1517}
1518template <>
1519EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
1520 return vreinterpretq_s8_u8(vcleq_s8(a, b));
1521}
1522template <>
1523EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1524 return vget_lane_u32(
1525 vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1526}
1527template <>
1528EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1529 return vcle_u8(a, b);
1530}
1531template <>
1532EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1533 return vcleq_u8(a, b);
1534}
1535template <>
1536EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b) {
1537 return vreinterpret_s16_u16(vcle_s16(a, b));
1538}
1539template <>
1540EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
1541 return vreinterpretq_s16_u16(vcleq_s16(a, b));
1542}
1543template <>
1544EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b) {
1545 return vcle_u16(a, b);
1546}
1547template <>
1548EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
1549 return vcleq_u16(a, b);
1550}
1551template <>
1552EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b) {
1553 return vreinterpret_s32_u32(vcle_s32(a, b));
1554}
1555template <>
1556EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
1557 return vreinterpretq_s32_u32(vcleq_s32(a, b));
1558}
1559template <>
1560EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1561 return vcle_u32(a, b);
1562}
1563template <>
1564EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1565 return vcleq_u32(a, b);
1566}
1567template <>
1568EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
1569#if EIGEN_ARCH_ARM64
1570 return vreinterpretq_s64_u64(vcleq_s64(a, b));
1571#else
1572 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1573 vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1574#endif
1575}
1576template <>
1577EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1578#if EIGEN_ARCH_ARM64
1579 return vcleq_u64(a, b);
1580#else
1581 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1582 vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1583#endif
1584}
1585
1586template <>
1587EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b) {
1588 return vreinterpret_f32_u32(vclt_f32(a, b));
1589}
1590template <>
1591EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1592 return vreinterpretq_f32_u32(vcltq_f32(a, b));
1593}
1594template <>
1595EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b) {
1596 return vget_lane_s32(
1597 vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1598}
1599template <>
1600EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b) {
1601 return vreinterpret_s8_u8(vclt_s8(a, b));
1602}
1603template <>
1604EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
1605 return vreinterpretq_s8_u8(vcltq_s8(a, b));
1606}
1607template <>
1608EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1609 return vget_lane_u32(
1610 vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1611}
1612template <>
1613EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1614 return vclt_u8(a, b);
1615}
1616template <>
1617EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1618 return vcltq_u8(a, b);
1619}
1620template <>
1621EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b) {
1622 return vreinterpret_s16_u16(vclt_s16(a, b));
1623}
1624template <>
1625EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
1626 return vreinterpretq_s16_u16(vcltq_s16(a, b));
1627}
1628template <>
1629EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b) {
1630 return vclt_u16(a, b);
1631}
1632template <>
1633EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
1634 return vcltq_u16(a, b);
1635}
1636template <>
1637EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b) {
1638 return vreinterpret_s32_u32(vclt_s32(a, b));
1639}
1640template <>
1641EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
1642 return vreinterpretq_s32_u32(vcltq_s32(a, b));
1643}
1644template <>
1645EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1646 return vclt_u32(a, b);
1647}
1648template <>
1649EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1650 return vcltq_u32(a, b);
1651}
1652template <>
1653EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
1654#if EIGEN_ARCH_ARM64
1655 return vreinterpretq_s64_u64(vcltq_s64(a, b));
1656#else
1657 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1658 vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1659#endif
1660}
1661template <>
1662EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1663#if EIGEN_ARCH_ARM64
1664 return vcltq_u64(a, b);
1665#else
1666 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1667 vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1668#endif
1669}
1670
1671template <>
1672EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b) {
1673 return vreinterpret_f32_u32(vceq_f32(a, b));
1674}
1675template <>
1676EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1677 return vreinterpretq_f32_u32(vceqq_f32(a, b));
1678}
1679template <>
1680EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b) {
1681 return vget_lane_s32(
1682 vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
1683}
1684template <>
1685EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b) {
1686 return vreinterpret_s8_u8(vceq_s8(a, b));
1687}
1688template <>
1689EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
1690 return vreinterpretq_s8_u8(vceqq_s8(a, b));
1691}
1692template <>
1693EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1694 return vget_lane_u32(
1695 vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
1696}
1697template <>
1698EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1699 return vceq_u8(a, b);
1700}
1701template <>
1702EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1703 return vceqq_u8(a, b);
1704}
1705template <>
1706EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b) {
1707 return vreinterpret_s16_u16(vceq_s16(a, b));
1708}
1709template <>
1710EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
1711 return vreinterpretq_s16_u16(vceqq_s16(a, b));
1712}
1713template <>
1714EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b) {
1715 return vceq_u16(a, b);
1716}
1717template <>
1718EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
1719 return vceqq_u16(a, b);
1720}
1721template <>
1722EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b) {
1723 return vreinterpret_s32_u32(vceq_s32(a, b));
1724}
1725template <>
1726EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
1727 return vreinterpretq_s32_u32(vceqq_s32(a, b));
1728}
1729template <>
1730EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1731 return vceq_u32(a, b);
1732}
1733template <>
1734EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1735 return vceqq_u32(a, b);
1736}
1737template <>
1738EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
1739#if EIGEN_ARCH_ARM64
1740 return vreinterpretq_s64_u64(vceqq_s64(a, b));
1741#else
1742 return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
1743 vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
1744#endif
1745}
1746template <>
1747EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1748#if EIGEN_ARCH_ARM64
1749 return vceqq_u64(a, b);
1750#else
1751 return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
1752 vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
1753#endif
1754}
1755
1756template <>
1757EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b) {
1758 return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
1759}
1760template <>
1761EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
1762 return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
1763}
1764
1765// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
1766template <>
1767EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b) {
1768 return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1769}
1770template <>
1771EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1772 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1773}
1774template <>
1775EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b) {
1776 return a & b;
1777}
1778template <>
1779EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b) {
1780 return vand_s8(a, b);
1781}
1782template <>
1783EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
1784 return vandq_s8(a, b);
1785}
1786template <>
1787EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1788 return a & b;
1789}
1790template <>
1791EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1792 return vand_u8(a, b);
1793}
1794template <>
1795EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1796 return vandq_u8(a, b);
1797}
1798template <>
1799EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) {
1800 return vand_s16(a, b);
1801}
1802template <>
1803EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
1804 return vandq_s16(a, b);
1805}
1806template <>
1807EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b) {
1808 return vand_u16(a, b);
1809}
1810template <>
1811EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1812 return vandq_u16(a, b);
1813}
1814template <>
1815EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) {
1816 return vand_s32(a, b);
1817}
1818template <>
1819EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1820 return vandq_s32(a, b);
1821}
1822template <>
1823EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1824 return vand_u32(a, b);
1825}
1826template <>
1827EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1828 return vandq_u32(a, b);
1829}
1830template <>
1831EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
1832 return vandq_s64(a, b);
1833}
1834template <>
1835EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1836 return vandq_u64(a, b);
1837}
1838
1839template <>
1840EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b) {
1841 return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1842}
1843template <>
1844EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1845 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1846}
1847template <>
1848EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b) {
1849 return a | b;
1850}
1851template <>
1852EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) {
1853 return vorr_s8(a, b);
1854}
1855template <>
1856EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
1857 return vorrq_s8(a, b);
1858}
1859template <>
1860EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1861 return a | b;
1862}
1863template <>
1864EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1865 return vorr_u8(a, b);
1866}
1867template <>
1868EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1869 return vorrq_u8(a, b);
1870}
1871template <>
1872EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b) {
1873 return vorr_s16(a, b);
1874}
1875template <>
1876EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1877 return vorrq_s16(a, b);
1878}
1879template <>
1880EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b) {
1881 return vorr_u16(a, b);
1882}
1883template <>
1884EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1885 return vorrq_u16(a, b);
1886}
1887template <>
1888EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) {
1889 return vorr_s32(a, b);
1890}
1891template <>
1892EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1893 return vorrq_s32(a, b);
1894}
1895template <>
1896EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1897 return vorr_u32(a, b);
1898}
1899template <>
1900EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1901 return vorrq_u32(a, b);
1902}
1903template <>
1904EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
1905 return vorrq_s64(a, b);
1906}
1907template <>
1908EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1909 return vorrq_u64(a, b);
1910}
1911
1912template <>
1913EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) {
1914 return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1915}
1916template <>
1917EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1918 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1919}
1920template <>
1921EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) {
1922 return a ^ b;
1923}
1924template <>
1925EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) {
1926 return veor_s8(a, b);
1927}
1928template <>
1929EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
1930 return veorq_s8(a, b);
1931}
1932template <>
1933EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
1934 return a ^ b;
1935}
1936template <>
1937EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
1938 return veor_u8(a, b);
1939}
1940template <>
1941EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1942 return veorq_u8(a, b);
1943}
1944template <>
1945EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) {
1946 return veor_s16(a, b);
1947}
1948template <>
1949EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
1950 return veorq_s16(a, b);
1951}
1952template <>
1953EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) {
1954 return veor_u16(a, b);
1955}
1956template <>
1957EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1958 return veorq_u16(a, b);
1959}
1960template <>
1961EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) {
1962 return veor_s32(a, b);
1963}
1964template <>
1965EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1966 return veorq_s32(a, b);
1967}
1968template <>
1969EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
1970 return veor_u32(a, b);
1971}
1972template <>
1973EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1974 return veorq_u32(a, b);
1975}
1976template <>
1977EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
1978 return veorq_s64(a, b);
1979}
1980template <>
1981EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1982 return veorq_u64(a, b);
1983}
1984
1985template <>
1986EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) {
1987 return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
1988}
1989template <>
1990EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1991 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
1992}
1993template <>
1994EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) {
1995 return a & ~b;
1996}
1997template <>
1998EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) {
1999 return vbic_s8(a, b);
2000}
2001template <>
2002EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
2003 return vbicq_s8(a, b);
2004}
2005template <>
2006EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
2007 return a & ~b;
2008}
2009template <>
2010EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
2011 return vbic_u8(a, b);
2012}
2013template <>
2014EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2015 return vbicq_u8(a, b);
2016}
2017template <>
2018EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) {
2019 return vbic_s16(a, b);
2020}
2021template <>
2022EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
2023 return vbicq_s16(a, b);
2024}
2025template <>
2026EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) {
2027 return vbic_u16(a, b);
2028}
2029template <>
2030EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
2031 return vbicq_u16(a, b);
2032}
2033template <>
2034EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) {
2035 return vbic_s32(a, b);
2036}
2037template <>
2038EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
2039 return vbicq_s32(a, b);
2040}
2041template <>
2042EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
2043 return vbic_u32(a, b);
2044}
2045template <>
2046EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
2047 return vbicq_u32(a, b);
2048}
2049template <>
2050EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
2051 return vbicq_s64(a, b);
2052}
2053template <>
2054EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
2055 return vbicq_u64(a, b);
2056}
2057
2058template <int N>
2059EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
2060 return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2061}
2062template <int N>
2063EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
2064 return vshr_n_s8(a, N);
2065}
2066template <int N>
2067EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
2068 return vshrq_n_s8(a, N);
2069}
2070template <int N>
2071EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
2072 return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2073}
2074template <int N>
2075EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
2076 return vshr_n_u8(a, N);
2077}
2078template <int N>
2079EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
2080 return vshrq_n_u8(a, N);
2081}
2082template <int N>
2083EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
2084 return vshr_n_s16(a, N);
2085}
2086template <int N>
2087EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
2088 return vshrq_n_s16(a, N);
2089}
2090template <int N>
2091EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
2092 return vshr_n_u16(a, N);
2093}
2094template <int N>
2095EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
2096 return vshrq_n_u16(a, N);
2097}
2098template <int N>
2099EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
2100 return vshr_n_s32(a, N);
2101}
2102template <int N>
2103EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
2104 return vshrq_n_s32(a, N);
2105}
2106template <int N>
2107EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
2108 return vshr_n_u32(a, N);
2109}
2110template <int N>
2111EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
2112 return vshrq_n_u32(a, N);
2113}
2114template <int N>
2115EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
2116 return vshrq_n_s64(a, N);
2117}
2118template <int N>
2119EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
2120 return vshrq_n_u64(a, N);
2121}
2122
2123template <int N>
2124EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
2125 return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
2126}
2127template <int N>
2128EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
2129 return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
2130}
2131template <int N>
2132EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
2133 return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
2134}
2135template <int N>
2136EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
2137 return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
2138}
2139template <int N>
2140EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
2141 return vshr_n_u8(a, N);
2142}
2143template <int N>
2144EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
2145 return vshrq_n_u8(a, N);
2146}
2147template <int N>
2148EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
2149 return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
2150}
2151template <int N>
2152EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
2153 return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
2154}
2155template <int N>
2156EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
2157 return vshr_n_u16(a, N);
2158}
2159template <int N>
2160EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
2161 return vshrq_n_u16(a, N);
2162}
2163template <int N>
2164EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
2165 return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
2166}
2167template <int N>
2168EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
2169 return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
2170}
2171template <int N>
2172EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
2173 return vshr_n_u32(a, N);
2174}
2175template <int N>
2176EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
2177 return vshrq_n_u32(a, N);
2178}
2179template <int N>
2180EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
2181 return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
2182}
2183template <int N>
2184EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
2185 return vshrq_n_u64(a, N);
2186}
2187
2188template <int N>
2189EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
2190 return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
2191}
2192template <int N>
2193EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
2194 return vshl_n_s8(a, N);
2195}
2196template <int N>
2197EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
2198 return vshlq_n_s8(a, N);
2199}
2200template <int N>
2201EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
2202 return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
2203}
2204template <int N>
2205EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
2206 return vshl_n_u8(a, N);
2207}
2208template <int N>
2209EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
2210 return vshlq_n_u8(a, N);
2211}
2212template <int N>
2213EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
2214 return vshl_n_s16(a, N);
2215}
2216template <int N>
2217EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
2218 return vshlq_n_s16(a, N);
2219}
2220template <int N>
2221EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
2222 return vshl_n_u16(a, N);
2223}
2224template <int N>
2225EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
2226 return vshlq_n_u16(a, N);
2227}
2228template <int N>
2229EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
2230 return vshl_n_s32(a, N);
2231}
2232template <int N>
2233EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
2234 return vshlq_n_s32(a, N);
2235}
2236template <int N>
2237EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
2238 return vshl_n_u32(a, N);
2239}
2240template <int N>
2241EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
2242 return vshlq_n_u32(a, N);
2243}
2244template <int N>
2245EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
2246 return vshlq_n_s64(a, N);
2247}
2248template <int N>
2249EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
2250 return vshlq_n_u64(a, N);
2251}
2252
2253template <>
2254EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
2255 EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(from));
2256}
2257template <>
2258EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
2259 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(from));
2260}
2261template <>
2262EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
2263 Packet4c res;
2264 memcpy(&res, from, sizeof(Packet4c));
2265 return res;
2266}
2267template <>
2268EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
2269 EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(from));
2270}
2271template <>
2272EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
2273 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(from));
2274}
2275template <>
2276EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
2277 Packet4uc res;
2278 memcpy(&res, from, sizeof(Packet4uc));
2279 return res;
2280}
2281template <>
2282EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
2283 EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(from));
2284}
2285template <>
2286EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
2287 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(from));
2288}
2289template <>
2290EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
2291 EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(from));
2292}
2293template <>
2294EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
2295 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(from));
2296}
2297template <>
2298EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
2299 EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(from));
2300}
2301template <>
2302EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
2303 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(from));
2304}
2305template <>
2306EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
2307 EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(from));
2308}
2309template <>
2310EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
2311 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(from));
2312}
2313template <>
2314EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
2315 EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(from));
2316}
2317template <>
2318EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
2319 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(from));
2320}
2321template <>
2322EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
2323 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(from));
2324}
2325template <>
2326EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
2327 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(from));
2328}
2329
2330template <>
2331EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from) {
2332 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
2333}
2334template <>
2335EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
2336 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
2337}
2338template <>
2339EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from) {
2340 Packet4c res;
2341 memcpy(&res, from, sizeof(Packet4c));
2342 return res;
2343}
2344template <>
2345EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from) {
2346 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
2347}
2348template <>
2349EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
2350 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
2351}
2352template <>
2353EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from) {
2354 Packet4uc res;
2355 memcpy(&res, from, sizeof(Packet4uc));
2356 return res;
2357}
2358template <>
2359EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from) {
2360 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
2361}
2362template <>
2363EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
2364 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
2365}
2366template <>
2367EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from) {
2368 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
2369}
2370template <>
2371EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
2372 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
2373}
2374template <>
2375EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from) {
2376 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
2377}
2378template <>
2379EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
2380 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
2381}
2382template <>
2383EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from) {
2384 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
2385}
2386template <>
2387EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
2388 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
2389}
2390template <>
2391EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from) {
2392 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
2393}
2394template <>
2395EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
2396 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
2397}
2398template <>
2399EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
2400 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
2401}
2402template <>
2403EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
2404 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
2405}
2406
2407template <>
2408EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from) {
2409 return vld1_dup_f32(from);
2410}
2411template <>
2412EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
2413 return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
2414}
2415template <>
2416EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from) {
2417 const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
2418 return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
2419}
2420template <>
2421EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from) {
2422 const int8x8_t a = vld1_s8(from);
2423 return vzip_s8(a, a).val[0];
2424}
2425template <>
2426EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
2427 const int8x8_t a = vld1_s8(from);
2428 const int8x8x2_t b = vzip_s8(a, a);
2429 return vcombine_s8(b.val[0], b.val[1]);
2430}
2431template <>
2432EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from) {
2433 const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
2434 return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
2435}
2436template <>
2437EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from) {
2438 const uint8x8_t a = vld1_u8(from);
2439 return vzip_u8(a, a).val[0];
2440}
2441template <>
2442EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
2443 const uint8x8_t a = vld1_u8(from);
2444 const uint8x8x2_t b = vzip_u8(a, a);
2445 return vcombine_u8(b.val[0], b.val[1]);
2446}
2447template <>
2448EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from) {
2449 return vreinterpret_s16_u32(
2450 vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
2451}
2452template <>
2453EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
2454 const int16x4_t a = vld1_s16(from);
2455 const int16x4x2_t b = vzip_s16(a, a);
2456 return vcombine_s16(b.val[0], b.val[1]);
2457}
2458template <>
2459EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from) {
2460 return vreinterpret_u16_u32(
2461 vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
2462}
2463template <>
2464EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
2465 const uint16x4_t a = vld1_u16(from);
2466 const uint16x4x2_t b = vzip_u16(a, a);
2467 return vcombine_u16(b.val[0], b.val[1]);
2468}
2469template <>
2470EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from) {
2471 return vld1_dup_s32(from);
2472}
2473template <>
2474EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
2475 return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
2476}
2477template <>
2478EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from) {
2479 return vld1_dup_u32(from);
2480}
2481template <>
2482EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
2483 return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
2484}
2485template <>
2486EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
2487 return vld1q_dup_s64(from);
2488}
2489template <>
2490EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
2491 return vld1q_dup_u64(from);
2492}
2493
2494template <>
2495EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
2496 return vld1q_dup_f32(from);
2497}
2498template <>
2499EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
2500 return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
2501}
2502template <>
2503EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
2504 return vreinterpret_s8_u32(
2505 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2506}
2507template <>
2508EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
2509 const int8x8_t a = vreinterpret_s8_u32(
2510 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
2511 const int8x8_t b = vreinterpret_s8_u32(
2512 vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
2513 return vcombine_s8(a, b);
2514}
2515template <>
2516EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
2517 return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
2518}
2519template <>
2520EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
2521 return vreinterpret_u8_u32(
2522 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2523}
2524template <>
2525EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
2526 const uint8x8_t a = vreinterpret_u8_u32(
2527 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
2528 const uint8x8_t b = vreinterpret_u8_u32(
2529 vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
2530 return vcombine_u8(a, b);
2531}
2532template <>
2533EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
2534 return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
2535}
2536template <>
2537EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
2538 return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
2539}
2540template <>
2541EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
2542 return vld1q_dup_s32(from);
2543}
2544template <>
2545EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
2546 return vld1q_dup_u32(from);
2547}
2548
2549template <>
2550EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
2551 EIGEN_DEBUG_ALIGNED_STORE vst1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(to), from);
2552}
2553template <>
2554EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
2555 EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(to), from);
2556}
2557template <>
2558EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
2559 memcpy(to, &from, sizeof(from));
2560}
2561template <>
2562EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
2563 EIGEN_DEBUG_ALIGNED_STORE vst1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(to), from);
2564}
2565template <>
2566EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
2567 EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(to), from);
2568}
2569template <>
2570EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
2571 memcpy(to, &from, sizeof(from));
2572}
2573template <>
2574EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
2575 EIGEN_DEBUG_ALIGNED_STORE vst1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(to), from);
2576}
2577template <>
2578EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
2579 EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(to), from);
2580}
2581template <>
2582EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
2583 EIGEN_DEBUG_ALIGNED_STORE vst1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(to), from);
2584}
2585template <>
2586EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
2587 EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(to), from);
2588}
2589template <>
2590EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
2591 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(to), from);
2592}
2593template <>
2594EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
2595 EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(to), from);
2596}
2597template <>
2598EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
2599 EIGEN_DEBUG_ALIGNED_STORE vst1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(to), from);
2600}
2601template <>
2602EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
2603 EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(to), from);
2604}
2605template <>
2606EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
2607 EIGEN_DEBUG_ALIGNED_STORE vst1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(to), from);
2608}
2609template <>
2610EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
2611 EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(to), from);
2612}
2613template <>
2614EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
2615 EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(to), from);
2616}
2617template <>
2618EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
2619 EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(to), from);
2620}
2621
2622template <>
2623EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
2624 EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
2625}
2626template <>
2627EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
2628 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
2629}
2630template <>
2631EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) {
2632 memcpy(to, &from, sizeof(from));
2633}
2634template <>
2635EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) {
2636 EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
2637}
2638template <>
2639EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
2640 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
2641}
2642template <>
2643EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) {
2644 memcpy(to, &from, sizeof(from));
2645}
2646template <>
2647EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) {
2648 EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
2649}
2650template <>
2651EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
2652 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
2653}
2654template <>
2655EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) {
2656 EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
2657}
2658template <>
2659EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
2660 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
2661}
2662template <>
2663EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) {
2664 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
2665}
2666template <>
2667EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
2668 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
2669}
2670template <>
2671EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) {
2672 EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
2673}
2674template <>
2675EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
2676 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
2677}
2678template <>
2679EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) {
2680 EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
2681}
2682template <>
2683EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
2684 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
2685}
2686template <>
2687EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
2688 EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
2689}
2690template <>
2691EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
2692 EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
2693}
2694
2695template <>
2696EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride) {
2697 Packet2f res = vld1_dup_f32(from);
2698 res = vld1_lane_f32(from + 1 * stride, res, 1);
2699 return res;
2700}
2701template <>
2702EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
2703 Packet4f res = vld1q_dup_f32(from);
2704 res = vld1q_lane_f32(from + 1 * stride, res, 1);
2705 res = vld1q_lane_f32(from + 2 * stride, res, 2);
2706 res = vld1q_lane_f32(from + 3 * stride, res, 3);
2707 return res;
2708}
2709template <>
2710EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride) {
2711 Packet4c res;
2712 for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
2713 return res;
2714}
2715template <>
2716EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride) {
2717 Packet8c res = vld1_dup_s8(from);
2718 res = vld1_lane_s8(from + 1 * stride, res, 1);
2719 res = vld1_lane_s8(from + 2 * stride, res, 2);
2720 res = vld1_lane_s8(from + 3 * stride, res, 3);
2721 res = vld1_lane_s8(from + 4 * stride, res, 4);
2722 res = vld1_lane_s8(from + 5 * stride, res, 5);
2723 res = vld1_lane_s8(from + 6 * stride, res, 6);
2724 res = vld1_lane_s8(from + 7 * stride, res, 7);
2725 return res;
2726}
2727template <>
2728EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
2729 Packet16c res = vld1q_dup_s8(from);
2730 res = vld1q_lane_s8(from + 1 * stride, res, 1);
2731 res = vld1q_lane_s8(from + 2 * stride, res, 2);
2732 res = vld1q_lane_s8(from + 3 * stride, res, 3);
2733 res = vld1q_lane_s8(from + 4 * stride, res, 4);
2734 res = vld1q_lane_s8(from + 5 * stride, res, 5);
2735 res = vld1q_lane_s8(from + 6 * stride, res, 6);
2736 res = vld1q_lane_s8(from + 7 * stride, res, 7);
2737 res = vld1q_lane_s8(from + 8 * stride, res, 8);
2738 res = vld1q_lane_s8(from + 9 * stride, res, 9);
2739 res = vld1q_lane_s8(from + 10 * stride, res, 10);
2740 res = vld1q_lane_s8(from + 11 * stride, res, 11);
2741 res = vld1q_lane_s8(from + 12 * stride, res, 12);
2742 res = vld1q_lane_s8(from + 13 * stride, res, 13);
2743 res = vld1q_lane_s8(from + 14 * stride, res, 14);
2744 res = vld1q_lane_s8(from + 15 * stride, res, 15);
2745 return res;
2746}
2747template <>
2748EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride) {
2749 Packet4uc res;
2750 for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
2751 return res;
2752}
2753template <>
2754EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride) {
2755 Packet8uc res = vld1_dup_u8(from);
2756 res = vld1_lane_u8(from + 1 * stride, res, 1);
2757 res = vld1_lane_u8(from + 2 * stride, res, 2);
2758 res = vld1_lane_u8(from + 3 * stride, res, 3);
2759 res = vld1_lane_u8(from + 4 * stride, res, 4);
2760 res = vld1_lane_u8(from + 5 * stride, res, 5);
2761 res = vld1_lane_u8(from + 6 * stride, res, 6);
2762 res = vld1_lane_u8(from + 7 * stride, res, 7);
2763 return res;
2764}
2765template <>
2766EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
2767 Packet16uc res = vld1q_dup_u8(from);
2768 res = vld1q_lane_u8(from + 1 * stride, res, 1);
2769 res = vld1q_lane_u8(from + 2 * stride, res, 2);
2770 res = vld1q_lane_u8(from + 3 * stride, res, 3);
2771 res = vld1q_lane_u8(from + 4 * stride, res, 4);
2772 res = vld1q_lane_u8(from + 5 * stride, res, 5);
2773 res = vld1q_lane_u8(from + 6 * stride, res, 6);
2774 res = vld1q_lane_u8(from + 7 * stride, res, 7);
2775 res = vld1q_lane_u8(from + 8 * stride, res, 8);
2776 res = vld1q_lane_u8(from + 9 * stride, res, 9);
2777 res = vld1q_lane_u8(from + 10 * stride, res, 10);
2778 res = vld1q_lane_u8(from + 11 * stride, res, 11);
2779 res = vld1q_lane_u8(from + 12 * stride, res, 12);
2780 res = vld1q_lane_u8(from + 13 * stride, res, 13);
2781 res = vld1q_lane_u8(from + 14 * stride, res, 14);
2782 res = vld1q_lane_u8(from + 15 * stride, res, 15);
2783 return res;
2784}
2785template <>
2786EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride) {
2787 Packet4s res = vld1_dup_s16(from);
2788 res = vld1_lane_s16(from + 1 * stride, res, 1);
2789 res = vld1_lane_s16(from + 2 * stride, res, 2);
2790 res = vld1_lane_s16(from + 3 * stride, res, 3);
2791 return res;
2792}
2793template <>
2794EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
2795 Packet8s res = vld1q_dup_s16(from);
2796 res = vld1q_lane_s16(from + 1 * stride, res, 1);
2797 res = vld1q_lane_s16(from + 2 * stride, res, 2);
2798 res = vld1q_lane_s16(from + 3 * stride, res, 3);
2799 res = vld1q_lane_s16(from + 4 * stride, res, 4);
2800 res = vld1q_lane_s16(from + 5 * stride, res, 5);
2801 res = vld1q_lane_s16(from + 6 * stride, res, 6);
2802 res = vld1q_lane_s16(from + 7 * stride, res, 7);
2803 return res;
2804}
2805template <>
2806EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride) {
2807 Packet4us res = vld1_dup_u16(from);
2808 res = vld1_lane_u16(from + 1 * stride, res, 1);
2809 res = vld1_lane_u16(from + 2 * stride, res, 2);
2810 res = vld1_lane_u16(from + 3 * stride, res, 3);
2811 return res;
2812}
2813template <>
2814EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
2815 Packet8us res = vld1q_dup_u16(from);
2816 res = vld1q_lane_u16(from + 1 * stride, res, 1);
2817 res = vld1q_lane_u16(from + 2 * stride, res, 2);
2818 res = vld1q_lane_u16(from + 3 * stride, res, 3);
2819 res = vld1q_lane_u16(from + 4 * stride, res, 4);
2820 res = vld1q_lane_u16(from + 5 * stride, res, 5);
2821 res = vld1q_lane_u16(from + 6 * stride, res, 6);
2822 res = vld1q_lane_u16(from + 7 * stride, res, 7);
2823 return res;
2824}
2825template <>
2826EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride) {
2827 Packet2i res = vld1_dup_s32(from);
2828 res = vld1_lane_s32(from + 1 * stride, res, 1);
2829 return res;
2830}
2831template <>
2832EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
2833 Packet4i res = vld1q_dup_s32(from);
2834 res = vld1q_lane_s32(from + 1 * stride, res, 1);
2835 res = vld1q_lane_s32(from + 2 * stride, res, 2);
2836 res = vld1q_lane_s32(from + 3 * stride, res, 3);
2837 return res;
2838}
2839template <>
2840EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride) {
2841 Packet2ui res = vld1_dup_u32(from);
2842 res = vld1_lane_u32(from + 1 * stride, res, 1);
2843 return res;
2844}
2845template <>
2846EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
2847 Packet4ui res = vld1q_dup_u32(from);
2848 res = vld1q_lane_u32(from + 1 * stride, res, 1);
2849 res = vld1q_lane_u32(from + 2 * stride, res, 2);
2850 res = vld1q_lane_u32(from + 3 * stride, res, 3);
2851 return res;
2852}
2853template <>
2854EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
2855 Packet2l res = vld1q_dup_s64(from);
2856 res = vld1q_lane_s64(from + 1 * stride, res, 1);
2857 return res;
2858}
2859template <>
2860EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
2861 Packet2ul res = vld1q_dup_u64(from);
2862 res = vld1q_lane_u64(from + 1 * stride, res, 1);
2863 return res;
2864}
2865
2866template <>
2867EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride) {
2868 vst1_lane_f32(to + stride * 0, from, 0);
2869 vst1_lane_f32(to + stride * 1, from, 1);
2870}
2871template <>
2872EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
2873 vst1q_lane_f32(to + stride * 0, from, 0);
2874 vst1q_lane_f32(to + stride * 1, from, 1);
2875 vst1q_lane_f32(to + stride * 2, from, 2);
2876 vst1q_lane_f32(to + stride * 3, from, 3);
2877}
2878template <>
2879EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride) {
2880 for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
2881}
2882template <>
2883EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride) {
2884 vst1_lane_s8(to + stride * 0, from, 0);
2885 vst1_lane_s8(to + stride * 1, from, 1);
2886 vst1_lane_s8(to + stride * 2, from, 2);
2887 vst1_lane_s8(to + stride * 3, from, 3);
2888 vst1_lane_s8(to + stride * 4, from, 4);
2889 vst1_lane_s8(to + stride * 5, from, 5);
2890 vst1_lane_s8(to + stride * 6, from, 6);
2891 vst1_lane_s8(to + stride * 7, from, 7);
2892}
2893template <>
2894EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
2895 Index stride) {
2896 vst1q_lane_s8(to + stride * 0, from, 0);
2897 vst1q_lane_s8(to + stride * 1, from, 1);
2898 vst1q_lane_s8(to + stride * 2, from, 2);
2899 vst1q_lane_s8(to + stride * 3, from, 3);
2900 vst1q_lane_s8(to + stride * 4, from, 4);
2901 vst1q_lane_s8(to + stride * 5, from, 5);
2902 vst1q_lane_s8(to + stride * 6, from, 6);
2903 vst1q_lane_s8(to + stride * 7, from, 7);
2904 vst1q_lane_s8(to + stride * 8, from, 8);
2905 vst1q_lane_s8(to + stride * 9, from, 9);
2906 vst1q_lane_s8(to + stride * 10, from, 10);
2907 vst1q_lane_s8(to + stride * 11, from, 11);
2908 vst1q_lane_s8(to + stride * 12, from, 12);
2909 vst1q_lane_s8(to + stride * 13, from, 13);
2910 vst1q_lane_s8(to + stride * 14, from, 14);
2911 vst1q_lane_s8(to + stride * 15, from, 15);
2912}
2913template <>
2914EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from,
2915 Index stride) {
2916 for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
2917}
2918template <>
2919EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from,
2920 Index stride) {
2921 vst1_lane_u8(to + stride * 0, from, 0);
2922 vst1_lane_u8(to + stride * 1, from, 1);
2923 vst1_lane_u8(to + stride * 2, from, 2);
2924 vst1_lane_u8(to + stride * 3, from, 3);
2925 vst1_lane_u8(to + stride * 4, from, 4);
2926 vst1_lane_u8(to + stride * 5, from, 5);
2927 vst1_lane_u8(to + stride * 6, from, 6);
2928 vst1_lane_u8(to + stride * 7, from, 7);
2929}
2930template <>
2931EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
2932 Index stride) {
2933 vst1q_lane_u8(to + stride * 0, from, 0);
2934 vst1q_lane_u8(to + stride * 1, from, 1);
2935 vst1q_lane_u8(to + stride * 2, from, 2);
2936 vst1q_lane_u8(to + stride * 3, from, 3);
2937 vst1q_lane_u8(to + stride * 4, from, 4);
2938 vst1q_lane_u8(to + stride * 5, from, 5);
2939 vst1q_lane_u8(to + stride * 6, from, 6);
2940 vst1q_lane_u8(to + stride * 7, from, 7);
2941 vst1q_lane_u8(to + stride * 8, from, 8);
2942 vst1q_lane_u8(to + stride * 9, from, 9);
2943 vst1q_lane_u8(to + stride * 10, from, 10);
2944 vst1q_lane_u8(to + stride * 11, from, 11);
2945 vst1q_lane_u8(to + stride * 12, from, 12);
2946 vst1q_lane_u8(to + stride * 13, from, 13);
2947 vst1q_lane_u8(to + stride * 14, from, 14);
2948 vst1q_lane_u8(to + stride * 15, from, 15);
2949}
2950template <>
2951EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from,
2952 Index stride) {
2953 vst1_lane_s16(to + stride * 0, from, 0);
2954 vst1_lane_s16(to + stride * 1, from, 1);
2955 vst1_lane_s16(to + stride * 2, from, 2);
2956 vst1_lane_s16(to + stride * 3, from, 3);
2957}
2958template <>
2959EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
2960 Index stride) {
2961 vst1q_lane_s16(to + stride * 0, from, 0);
2962 vst1q_lane_s16(to + stride * 1, from, 1);
2963 vst1q_lane_s16(to + stride * 2, from, 2);
2964 vst1q_lane_s16(to + stride * 3, from, 3);
2965 vst1q_lane_s16(to + stride * 4, from, 4);
2966 vst1q_lane_s16(to + stride * 5, from, 5);
2967 vst1q_lane_s16(to + stride * 6, from, 6);
2968 vst1q_lane_s16(to + stride * 7, from, 7);
2969}
2970template <>
2971EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from,
2972 Index stride) {
2973 vst1_lane_u16(to + stride * 0, from, 0);
2974 vst1_lane_u16(to + stride * 1, from, 1);
2975 vst1_lane_u16(to + stride * 2, from, 2);
2976 vst1_lane_u16(to + stride * 3, from, 3);
2977}
2978template <>
2979EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
2980 Index stride) {
2981 vst1q_lane_u16(to + stride * 0, from, 0);
2982 vst1q_lane_u16(to + stride * 1, from, 1);
2983 vst1q_lane_u16(to + stride * 2, from, 2);
2984 vst1q_lane_u16(to + stride * 3, from, 3);
2985 vst1q_lane_u16(to + stride * 4, from, 4);
2986 vst1q_lane_u16(to + stride * 5, from, 5);
2987 vst1q_lane_u16(to + stride * 6, from, 6);
2988 vst1q_lane_u16(to + stride * 7, from, 7);
2989}
2990template <>
2991EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from,
2992 Index stride) {
2993 vst1_lane_s32(to + stride * 0, from, 0);
2994 vst1_lane_s32(to + stride * 1, from, 1);
2995}
2996template <>
2997EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
2998 Index stride) {
2999 vst1q_lane_s32(to + stride * 0, from, 0);
3000 vst1q_lane_s32(to + stride * 1, from, 1);
3001 vst1q_lane_s32(to + stride * 2, from, 2);
3002 vst1q_lane_s32(to + stride * 3, from, 3);
3003}
3004template <>
3005EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from,
3006 Index stride) {
3007 vst1_lane_u32(to + stride * 0, from, 0);
3008 vst1_lane_u32(to + stride * 1, from, 1);
3009}
3010template <>
3011EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
3012 Index stride) {
3013 vst1q_lane_u32(to + stride * 0, from, 0);
3014 vst1q_lane_u32(to + stride * 1, from, 1);
3015 vst1q_lane_u32(to + stride * 2, from, 2);
3016 vst1q_lane_u32(to + stride * 3, from, 3);
3017}
3018template <>
3019EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
3020 Index stride) {
3021 vst1q_lane_s64(to + stride * 0, from, 0);
3022 vst1q_lane_s64(to + stride * 1, from, 1);
3023}
3024template <>
3025EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
3026 Index stride) {
3027 vst1q_lane_u64(to + stride * 0, from, 0);
3028 vst1q_lane_u64(to + stride * 1, from, 1);
3029}
3030
// prefetch<T> specializations for every scalar type handled in this file.
// Each one simply forwards `addr` to the EIGEN_ARM_PREFETCH macro; the macro
// is defined outside this section, so what it expands to (and whether it
// emits anything at all) cannot be told from here.
// NOTE(review): presumably a cache-prefetch hint with no effect on program
// results -- confirm against the macro definition.
3031template <>
3032EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
3033 EIGEN_ARM_PREFETCH(addr);
3034}
// 8-bit element pointers.
3035template <>
3036EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
3037 EIGEN_ARM_PREFETCH(addr);
3038}
3039template <>
3040EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
3041 EIGEN_ARM_PREFETCH(addr);
3042}
// 16-bit element pointers.
3043template <>
3044EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
3045 EIGEN_ARM_PREFETCH(addr);
3046}
3047template <>
3048EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
3049 EIGEN_ARM_PREFETCH(addr);
3050}
// 32-bit element pointers.
3051template <>
3052EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
3053 EIGEN_ARM_PREFETCH(addr);
3054}
3055template <>
3056EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
3057 EIGEN_ARM_PREFETCH(addr);
3058}
// 64-bit element pointers.
3059template <>
3060EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
3061 EIGEN_ARM_PREFETCH(addr);
3062}
3063template <>
3064EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
3065 EIGEN_ARM_PREFETCH(addr);
3066}
3067
3068template <>
3069EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) {
3070 return vget_lane_f32(a, 0);
3071}
3072template <>
3073EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
3074 return vgetq_lane_f32(a, 0);
3075}
3076template <>
3077EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) {
3078 return static_cast<int8_t>(a & 0xff);
3079}
3080template <>
3081EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) {
3082 return vget_lane_s8(a, 0);
3083}
3084template <>
3085EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
3086 return vgetq_lane_s8(a, 0);
3087}
3088template <>
3089EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) {
3090 return static_cast<uint8_t>(a & 0xff);
3091}
3092template <>
3093EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) {
3094 return vget_lane_u8(a, 0);
3095}
3096template <>
3097EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
3098 return vgetq_lane_u8(a, 0);
3099}
3100template <>
3101EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) {
3102 return vget_lane_s16(a, 0);
3103}
3104template <>
3105EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
3106 return vgetq_lane_s16(a, 0);
3107}
3108template <>
3109EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) {
3110 return vget_lane_u16(a, 0);
3111}
3112template <>
3113EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
3114 return vgetq_lane_u16(a, 0);
3115}
3116template <>
3117EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) {
3118 return vget_lane_s32(a, 0);
3119}
3120template <>
3121EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
3122 return vgetq_lane_s32(a, 0);
3123}
3124template <>
3125EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) {
3126 return vget_lane_u32(a, 0);
3127}
3128template <>
3129EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
3130 return vgetq_lane_u32(a, 0);
3131}
3132template <>
3133EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
3134 return vgetq_lane_s64(a, 0);
3135}
3136template <>
3137EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
3138 return vgetq_lane_u64(a, 0);
3139}
3140
3141template <>
3142EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) {
3143 return vrev64_f32(a);
3144}
3145template <>
3146EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
3147 const float32x4_t a_r64 = vrev64q_f32(a);
3148 return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
3149}
3150template <>
3151EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) {
3152 return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3153}
3154template <>
3155EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) {
3156 return vrev64_s8(a);
3157}
3158template <>
3159EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
3160 const int8x16_t a_r64 = vrev64q_s8(a);
3161 return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
3162}
3163template <>
3164EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) {
3165 return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
3166}
3167template <>
3168EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) {
3169 return vrev64_u8(a);
3170}
3171template <>
3172EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
3173 const uint8x16_t a_r64 = vrev64q_u8(a);
3174 return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
3175}
3176template <>
3177EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) {
3178 return vrev64_s16(a);
3179}
3180template <>
3181EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
3182 const int16x8_t a_r64 = vrev64q_s16(a);
3183 return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
3184}
3185template <>
3186EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) {
3187 return vrev64_u16(a);
3188}
3189template <>
3190EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
3191 const uint16x8_t a_r64 = vrev64q_u16(a);
3192 return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
3193}
3194template <>
3195EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) {
3196 return vrev64_s32(a);
3197}
3198template <>
3199EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
3200 const int32x4_t a_r64 = vrev64q_s32(a);
3201 return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
3202}
3203template <>
3204EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) {
3205 return vrev64_u32(a);
3206}
3207template <>
3208EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
3209 const uint32x4_t a_r64 = vrev64q_u32(a);
3210 return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
3211}
3212template <>
3213EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
3214 return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
3215}
3216template <>
3217EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
3218 return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
3219}
3220
3221template <>
3222EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) {
3223 return vabs_f32(a);
3224}
3225template <>
3226EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
3227 return vabsq_f32(a);
3228}
3229template <>
3230EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a) {
3231 return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
3232}
3233template <>
3234EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) {
3235 return vabs_s8(a);
3236}
3237template <>
3238EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
3239 return vabsq_s8(a);
3240}
3241template <>
3242EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) {
3243 return a;
3244}
3245template <>
3246EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) {
3247 return a;
3248}
3249template <>
3250EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
3251 return a;
3252}
3253template <>
3254EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) {
3255 return vabs_s16(a);
3256}
3257template <>
3258EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
3259 return vabsq_s16(a);
3260}
3261template <>
3262EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) {
3263 return a;
3264}
3265template <>
3266EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
3267 return a;
3268}
3269template <>
3270EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) {
3271 return vabs_s32(a);
3272}
3273template <>
3274EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
3275 return vabsq_s32(a);
3276}
3277template <>
3278EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) {
3279 return a;
3280}
3281template <>
3282EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
3283 return a;
3284}
3285template <>
3286EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
3287#if EIGEN_ARCH_ARM64
3288 return vabsq_s64(a);
3289#else
3290 return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
3291#endif
3292}
3293template <>
3294EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
3295 return a;
3296}
3297
3298template <>
3299EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
3300 return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
3301}
3302template <>
3303EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
3304 return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
3305}
3306
3307template <>
3308EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent) {
3309 return pfrexp_generic(a, exponent);
3310}
3311template <>
3312EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
3313 return pfrexp_generic(a, exponent);
3314}
3315
3316template <>
3317EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent) {
3318 return pldexp_generic(a, exponent);
3319}
3320template <>
3321EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
3322 return pldexp_generic(a, exponent);
3323}
3324
3325#if EIGEN_ARCH_ARM64
3326template <>
3327EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
3328 return vaddv_f32(a);
3329}
3330template <>
3331EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
3332 return vaddvq_f32(a);
3333}
3334#else
3335template <>
3336EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
3337 return vget_lane_f32(vpadd_f32(a, a), 0);
3338}
3339template <>
3340EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
3341 const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
3342 return vget_lane_f32(vpadd_f32(sum, sum), 0);
3343}
3344#endif
3345template <>
3346EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a) {
3347 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3348 int8x8_t sum = vpadd_s8(a_dup, a_dup);
3349 sum = vpadd_s8(sum, sum);
3350 return vget_lane_s8(sum, 0);
3351}
3352#if EIGEN_ARCH_ARM64
3353template <>
3354EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
3355 return vaddv_s8(a);
3356}
3357template <>
3358EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
3359 return vaddvq_s8(a);
3360}
3361#else
3362template <>
3363EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
3364 int8x8_t sum = vpadd_s8(a, a);
3365 sum = vpadd_s8(sum, sum);
3366 sum = vpadd_s8(sum, sum);
3367 return vget_lane_s8(sum, 0);
3368}
3369template <>
3370EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
3371 int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
3372 sum = vpadd_s8(sum, sum);
3373 sum = vpadd_s8(sum, sum);
3374 sum = vpadd_s8(sum, sum);
3375 return vget_lane_s8(sum, 0);
3376}
3377#endif
3378template <>
3379EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a) {
3380 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3381 uint8x8_t sum = vpadd_u8(a_dup, a_dup);
3382 sum = vpadd_u8(sum, sum);
3383 return vget_lane_u8(sum, 0);
3384}
3385#if EIGEN_ARCH_ARM64
3386template <>
3387EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
3388 return vaddv_u8(a);
3389}
3390template <>
3391EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
3392 return vaddvq_u8(a);
3393}
3394template <>
3395EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
3396 return vaddv_s16(a);
3397}
3398template <>
3399EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
3400 return vaddvq_s16(a);
3401}
3402template <>
3403EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
3404 return vaddv_u16(a);
3405}
3406template <>
3407EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
3408 return vaddvq_u16(a);
3409}
3410template <>
3411EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
3412 return vaddv_s32(a);
3413}
3414template <>
3415EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
3416 return vaddvq_s32(a);
3417}
3418template <>
3419EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
3420 return vaddv_u32(a);
3421}
3422template <>
3423EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
3424 return vaddvq_u32(a);
3425}
3426template <>
3427EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
3428 return vaddvq_s64(a);
3429}
3430template <>
3431EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
3432 return vaddvq_u64(a);
3433}
3434#else
3435template <>
3436EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
3437 uint8x8_t sum = vpadd_u8(a, a);
3438 sum = vpadd_u8(sum, sum);
3439 sum = vpadd_u8(sum, sum);
3440 return vget_lane_u8(sum, 0);
3441}
3442template <>
3443EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
3444 uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
3445 sum = vpadd_u8(sum, sum);
3446 sum = vpadd_u8(sum, sum);
3447 sum = vpadd_u8(sum, sum);
3448 return vget_lane_u8(sum, 0);
3449}
3450template <>
3451EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
3452 const int16x4_t sum = vpadd_s16(a, a);
3453 return vget_lane_s16(vpadd_s16(sum, sum), 0);
3454}
3455template <>
3456EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
3457 int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
3458 sum = vpadd_s16(sum, sum);
3459 sum = vpadd_s16(sum, sum);
3460 return vget_lane_s16(sum, 0);
3461}
3462template <>
3463EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
3464 const uint16x4_t sum = vpadd_u16(a, a);
3465 return vget_lane_u16(vpadd_u16(sum, sum), 0);
3466}
3467template <>
3468EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
3469 uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
3470 sum = vpadd_u16(sum, sum);
3471 sum = vpadd_u16(sum, sum);
3472 return vget_lane_u16(sum, 0);
3473}
3474template <>
3475EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
3476 return vget_lane_s32(vpadd_s32(a, a), 0);
3477}
3478template <>
3479EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
3480 const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
3481 return vget_lane_s32(vpadd_s32(sum, sum), 0);
3482}
3483template <>
3484EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
3485 return vget_lane_u32(vpadd_u32(a, a), 0);
3486}
3487template <>
3488EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
3489 const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
3490 return vget_lane_u32(vpadd_u32(sum, sum), 0);
3491}
3492template <>
3493EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
3494 return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
3495}
3496template <>
3497EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
3498 return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
3499}
3500#endif
3501
3502template <>
3503EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
3504 return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
3505}
3506template <>
3507EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
3508 return vadd_s8(vget_high_s8(a), vget_low_s8(a));
3509}
3510template <>
3511EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
3512 return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
3513}
3514template <>
3515EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
3516 return vadd_u8(vget_high_u8(a), vget_low_u8(a));
3517}
3518template <>
3519EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
3520 return vadd_s16(vget_high_s16(a), vget_low_s16(a));
3521}
3522template <>
3523EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
3524 return vadd_u16(vget_high_u16(a), vget_low_u16(a));
3525}
3526
3527// Other reduction functions:
3528// mul
3529template <>
3530EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a) {
3531 return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
3532}
3533template <>
3534EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
3535 return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
3536}
3537template <>
3538EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a) {
3539 int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
3540 prod = vmul_s8(prod, vrev16_s8(prod));
3541 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
3542}
3543template <>
3544EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a) {
3545 int8x8_t prod = vmul_s8(a, vrev16_s8(a));
3546 prod = vmul_s8(prod, vrev32_s8(prod));
3547 return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
3548}
3549template <>
3550EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
3551 return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
3552}
3553template <>
3554EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a) {
3555 uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
3556 prod = vmul_u8(prod, vrev16_u8(prod));
3557 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
3558}
3559template <>
3560EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a) {
3561 uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
3562 prod = vmul_u8(prod, vrev32_u8(prod));
3563 return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
3564}
3565template <>
3566EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
3567 return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
3568}
3569template <>
3570EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a) {
3571 const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
3572 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3573}
3574template <>
3575EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
3576 int16x4_t prod;
3577
3578 // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3579 prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
3580 // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3581 prod = vmul_s16(prod, vrev32_s16(prod));
3582 // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3583 return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
3584}
3585template <>
3586EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a) {
3587 const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
3588 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3589}
3590template <>
3591EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
3592 uint16x4_t prod;
3593
3594 // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
3595 prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
3596 // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
3597 prod = vmul_u16(prod, vrev32_u16(prod));
3598 // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
3599 return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
3600}
3601template <>
3602EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a) {
3603 return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
3604}
3605template <>
3606EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
3607 return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
3608}
3609template <>
3610EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a) {
3611 return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
3612}
3613template <>
3614EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
3615 return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
3616}
3617template <>
3618EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
3619 return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
3620}
3621template <>
3622EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
3623 return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
3624}
3625
3626// min
3627#if EIGEN_ARCH_ARM64
3628template <>
3629EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
3630 return vminv_f32(a);
3631}
3632template <>
3633EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
3634 return vminvq_f32(a);
3635}
3636#else
3637template <>
3638EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
3639 return vget_lane_f32(vpmin_f32(a, a), 0);
3640}
3641template <>
3642EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
3643 const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
3644 return vget_lane_f32(vpmin_f32(min, min), 0);
3645}
3646#endif
3647template <>
3648EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a) {
3649 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3650 int8x8_t min = vpmin_s8(a_dup, a_dup);
3651 min = vpmin_s8(min, min);
3652 return vget_lane_s8(min, 0);
3653}
3654#if EIGEN_ARCH_ARM64
3655template <>
3656EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
3657 return vminv_s8(a);
3658}
3659template <>
3660EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
3661 return vminvq_s8(a);
3662}
3663#else
3664template <>
3665EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
3666 int8x8_t min = vpmin_s8(a, a);
3667 min = vpmin_s8(min, min);
3668 min = vpmin_s8(min, min);
3669 return vget_lane_s8(min, 0);
3670}
3671template <>
3672EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
3673 int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
3674 min = vpmin_s8(min, min);
3675 min = vpmin_s8(min, min);
3676 min = vpmin_s8(min, min);
3677 return vget_lane_s8(min, 0);
3678}
3679#endif
3680template <>
3681EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a) {
3682 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3683 uint8x8_t min = vpmin_u8(a_dup, a_dup);
3684 min = vpmin_u8(min, min);
3685 return vget_lane_u8(min, 0);
3686}
3687#if EIGEN_ARCH_ARM64
3688template <>
3689EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
3690 return vminv_u8(a);
3691}
3692template <>
3693EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
3694 return vminvq_u8(a);
3695}
3696template <>
3697EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
3698 return vminv_s16(a);
3699}
3700template <>
3701EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
3702 return vminvq_s16(a);
3703}
3704template <>
3705EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
3706 return vminv_u16(a);
3707}
3708template <>
3709EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
3710 return vminvq_u16(a);
3711}
3712template <>
3713EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
3714 return vminv_s32(a);
3715}
3716template <>
3717EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
3718 return vminvq_s32(a);
3719}
3720template <>
3721EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
3722 return vminv_u32(a);
3723}
3724template <>
3725EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
3726 return vminvq_u32(a);
3727}
3728#else
3729template <>
3730EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
3731 uint8x8_t min = vpmin_u8(a, a);
3732 min = vpmin_u8(min, min);
3733 min = vpmin_u8(min, min);
3734 return vget_lane_u8(min, 0);
3735}
3736template <>
3737EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
3738 uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
3739 min = vpmin_u8(min, min);
3740 min = vpmin_u8(min, min);
3741 min = vpmin_u8(min, min);
3742 return vget_lane_u8(min, 0);
3743}
3744template <>
3745EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
3746 const int16x4_t min = vpmin_s16(a, a);
3747 return vget_lane_s16(vpmin_s16(min, min), 0);
3748}
3749template <>
3750EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
3751 int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
3752 min = vpmin_s16(min, min);
3753 min = vpmin_s16(min, min);
3754 return vget_lane_s16(min, 0);
3755}
3756template <>
3757EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
3758 const uint16x4_t min = vpmin_u16(a, a);
3759 return vget_lane_u16(vpmin_u16(min, min), 0);
3760}
3761template <>
3762EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
3763 uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
3764 min = vpmin_u16(min, min);
3765 min = vpmin_u16(min, min);
3766 return vget_lane_u16(min, 0);
3767}
3768template <>
3769EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
3770 return vget_lane_s32(vpmin_s32(a, a), 0);
3771}
3772template <>
3773EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
3774 const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
3775 return vget_lane_s32(vpmin_s32(min, min), 0);
3776}
3777template <>
3778EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
3779 return vget_lane_u32(vpmin_u32(a, a), 0);
3780}
3781template <>
3782EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
3783 const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
3784 return vget_lane_u32(vpmin_u32(min, min), 0);
3785}
3786#endif
3787template <>
3788EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
3789 return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3790}
3791template <>
3792EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
3793 return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
3794}
3795
3796// max
3797#if EIGEN_ARCH_ARM64
3798template <>
3799EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
3800 return vmaxv_f32(a);
3801}
3802template <>
3803EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
3804 return vmaxvq_f32(a);
3805}
3806#else
3807template <>
3808EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
3809 return vget_lane_f32(vpmax_f32(a, a), 0);
3810}
3811template <>
3812EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
3813 const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
3814 return vget_lane_f32(vpmax_f32(max, max), 0);
3815}
3816#endif
3817template <>
3818EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a) {
3819 const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
3820 int8x8_t max = vpmax_s8(a_dup, a_dup);
3821 max = vpmax_s8(max, max);
3822 return vget_lane_s8(max, 0);
3823}
3824#if EIGEN_ARCH_ARM64
3825template <>
3826EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
3827 return vmaxv_s8(a);
3828}
3829template <>
3830EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
3831 return vmaxvq_s8(a);
3832}
3833#else
3834template <>
3835EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
3836 int8x8_t max = vpmax_s8(a, a);
3837 max = vpmax_s8(max, max);
3838 max = vpmax_s8(max, max);
3839 return vget_lane_s8(max, 0);
3840}
3841template <>
3842EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
3843 int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
3844 max = vpmax_s8(max, max);
3845 max = vpmax_s8(max, max);
3846 max = vpmax_s8(max, max);
3847 return vget_lane_s8(max, 0);
3848}
3849#endif
3850template <>
3851EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a) {
3852 const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
3853 uint8x8_t max = vpmax_u8(a_dup, a_dup);
3854 max = vpmax_u8(max, max);
3855 return vget_lane_u8(max, 0);
3856}
3857#if EIGEN_ARCH_ARM64
3858template <>
3859EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
3860 return vmaxv_u8(a);
3861}
3862template <>
3863EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
3864 return vmaxvq_u8(a);
3865}
3866template <>
3867EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
3868 return vmaxv_s16(a);
3869}
3870template <>
3871EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
3872 return vmaxvq_s16(a);
3873}
3874template <>
3875EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
3876 return vmaxv_u16(a);
3877}
3878template <>
3879EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
3880 return vmaxvq_u16(a);
3881}
3882template <>
3883EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
3884 return vmaxv_s32(a);
3885}
3886template <>
3887EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
3888 return vmaxvq_s32(a);
3889}
3890template <>
3891EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
3892 return vmaxv_u32(a);
3893}
3894template <>
3895EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
3896 return vmaxvq_u32(a);
3897}
3898#else
3899template <>
3900EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
3901 uint8x8_t max = vpmax_u8(a, a);
3902 max = vpmax_u8(max, max);
3903 max = vpmax_u8(max, max);
3904 return vget_lane_u8(max, 0);
3905}
3906template <>
3907EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
3908 uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
3909 max = vpmax_u8(max, max);
3910 max = vpmax_u8(max, max);
3911 max = vpmax_u8(max, max);
3912 return vget_lane_u8(max, 0);
3913}
3914template <>
3915EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
3916 const int16x4_t max = vpmax_s16(a, a);
3917 return vget_lane_s16(vpmax_s16(max, max), 0);
3918}
3919template <>
3920EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
3921 int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
3922 max = vpmax_s16(max, max);
3923 max = vpmax_s16(max, max);
3924 return vget_lane_s16(max, 0);
3925}
3926template <>
3927EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
3928 const uint16x4_t max = vpmax_u16(a, a);
3929 return vget_lane_u16(vpmax_u16(max, max), 0);
3930}
3931template <>
3932EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
3933 uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
3934 max = vpmax_u16(max, max);
3935 max = vpmax_u16(max, max);
3936 return vget_lane_u16(max, 0);
3937}
3938template <>
3939EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
3940 return vget_lane_s32(vpmax_s32(a, a), 0);
3941}
3942template <>
3943EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
3944 const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
3945 return vget_lane_s32(vpmax_s32(max, max), 0);
3946}
3947template <>
3948EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
3949 return vget_lane_u32(vpmax_u32(a, a), 0);
3950}
3951template <>
3952EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
3953 const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
3954 return vget_lane_u32(vpmax_u32(max, max), 0);
3955}
3956#endif
3957template <>
3958EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
3959 return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
3960}
3961template <>
3962EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
3963 return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
3964}
3965
3966template <>
3967EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
3968 uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
3969 return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
3970}
3971
3972// Helpers for ptranspose.
3973namespace detail {
3974
3975template <typename Packet>
3976void zip_in_place(Packet& p1, Packet& p2);
3977
3978template <>
3979EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
3980 const float32x2x2_t tmp = vzip_f32(p1, p2);
3981 p1 = tmp.val[0];
3982 p2 = tmp.val[1];
3983}
3984
3985template <>
3986EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
3987 const float32x4x2_t tmp = vzipq_f32(p1, p2);
3988 p1 = tmp.val[0];
3989 p2 = tmp.val[1];
3990}
3991
3992template <>
3993EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
3994 const int8x8x2_t tmp = vzip_s8(p1, p2);
3995 p1 = tmp.val[0];
3996 p2 = tmp.val[1];
3997}
3998
3999template <>
4000EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
4001 const int8x16x2_t tmp = vzipq_s8(p1, p2);
4002 p1 = tmp.val[0];
4003 p2 = tmp.val[1];
4004}
4005
4006template <>
4007EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
4008 const uint8x8x2_t tmp = vzip_u8(p1, p2);
4009 p1 = tmp.val[0];
4010 p2 = tmp.val[1];
4011}
4012
4013template <>
4014EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
4015 const uint8x16x2_t tmp = vzipq_u8(p1, p2);
4016 p1 = tmp.val[0];
4017 p2 = tmp.val[1];
4018}
4019
4020template <>
4021EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
4022 const int32x2x2_t tmp = vzip_s32(p1, p2);
4023 p1 = tmp.val[0];
4024 p2 = tmp.val[1];
4025}
4026
4027template <>
4028EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
4029 const int32x4x2_t tmp = vzipq_s32(p1, p2);
4030 p1 = tmp.val[0];
4031 p2 = tmp.val[1];
4032}
4033
4034template <>
4035EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
4036 const uint32x2x2_t tmp = vzip_u32(p1, p2);
4037 p1 = tmp.val[0];
4038 p2 = tmp.val[1];
4039}
4040
4041template <>
4042EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
4043 const uint32x4x2_t tmp = vzipq_u32(p1, p2);
4044 p1 = tmp.val[0];
4045 p2 = tmp.val[1];
4046}
4047
4048template <>
4049EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
4050 const int16x4x2_t tmp = vzip_s16(p1, p2);
4051 p1 = tmp.val[0];
4052 p2 = tmp.val[1];
4053}
4054
4055template <>
4056EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
4057 const int16x8x2_t tmp = vzipq_s16(p1, p2);
4058 p1 = tmp.val[0];
4059 p2 = tmp.val[1];
4060}
4061
4062template <>
4063EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
4064 const uint16x4x2_t tmp = vzip_u16(p1, p2);
4065 p1 = tmp.val[0];
4066 p2 = tmp.val[1];
4067}
4068
4069template <>
4070EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
4071 const uint16x8x2_t tmp = vzipq_u16(p1, p2);
4072 p1 = tmp.val[0];
4073 p2 = tmp.val[1];
4074}
4075
4076template <typename Packet>
4077EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
4078 zip_in_place(kernel.packet[0], kernel.packet[1]);
4079}
4080
4081template <typename Packet>
4082EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
4083 zip_in_place(kernel.packet[0], kernel.packet[2]);
4084 zip_in_place(kernel.packet[1], kernel.packet[3]);
4085 zip_in_place(kernel.packet[0], kernel.packet[1]);
4086 zip_in_place(kernel.packet[2], kernel.packet[3]);
4087}
4088
4089template <typename Packet>
4090EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
4091 zip_in_place(kernel.packet[0], kernel.packet[4]);
4092 zip_in_place(kernel.packet[1], kernel.packet[5]);
4093 zip_in_place(kernel.packet[2], kernel.packet[6]);
4094 zip_in_place(kernel.packet[3], kernel.packet[7]);
4095
4096 zip_in_place(kernel.packet[0], kernel.packet[2]);
4097 zip_in_place(kernel.packet[1], kernel.packet[3]);
4098 zip_in_place(kernel.packet[4], kernel.packet[6]);
4099 zip_in_place(kernel.packet[5], kernel.packet[7]);
4100
4101 zip_in_place(kernel.packet[0], kernel.packet[1]);
4102 zip_in_place(kernel.packet[2], kernel.packet[3]);
4103 zip_in_place(kernel.packet[4], kernel.packet[5]);
4104 zip_in_place(kernel.packet[6], kernel.packet[7]);
4105}
4106
4107template <typename Packet>
4108EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
4109 EIGEN_UNROLL_LOOP
4110 for (int i = 0; i < 4; ++i) {
4111 const int m = (1 << i);
4112 EIGEN_UNROLL_LOOP
4113 for (int j = 0; j < m; ++j) {
4114 const int n = (1 << (3 - i));
4115 EIGEN_UNROLL_LOOP
4116 for (int k = 0; k < n; ++k) {
4117 const int idx = 2 * j * n + k;
4118 zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
4119 }
4120 }
4121 }
4122}
4123
4124} // namespace detail
4125
4126EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
4127 detail::ptranspose_impl(kernel);
4128}
4129EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
4130 detail::ptranspose_impl(kernel);
4131}
4132
4133EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
4134 const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
4135 const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
4136
4137 const int8x8x2_t zip8 = vzip_s8(a, b);
4138 const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
4139
4140 kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
4141 kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
4142 kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
4143 kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
4144}
4145EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
4146 detail::ptranspose_impl(kernel);
4147}
4148EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
4149 detail::ptranspose_impl(kernel);
4150}
4151EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
4152 detail::ptranspose_impl(kernel);
4153}
4154EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
4155 detail::ptranspose_impl(kernel);
4156}
4157EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
4158 detail::ptranspose_impl(kernel);
4159}
4160
4161EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
4162 const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
4163 const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
4164
4165 const uint8x8x2_t zip8 = vzip_u8(a, b);
4166 const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
4167
4168 kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
4169 kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
4170 kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
4171 kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
4172}
4173EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
4174 detail::ptranspose_impl(kernel);
4175}
4176EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
4177 detail::ptranspose_impl(kernel);
4178}
4179EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
4180 detail::ptranspose_impl(kernel);
4181}
4182EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
4183 detail::ptranspose_impl(kernel);
4184}
4185EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
4186 detail::ptranspose_impl(kernel);
4187}
4188
4189EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
4190 detail::ptranspose_impl(kernel);
4191}
4192EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
4193 detail::ptranspose_impl(kernel);
4194}
4195EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
4196 detail::ptranspose_impl(kernel);
4197}
4198
4199EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
4200 detail::ptranspose_impl(kernel);
4201}
4202EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
4203 detail::ptranspose_impl(kernel);
4204}
4205EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
4206 detail::ptranspose_impl(kernel);
4207}
4208
4209EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
4210 detail::ptranspose_impl(kernel);
4211}
4212EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
4213 detail::ptranspose_impl(kernel);
4214}
4215EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
4216 detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
4217}
4218EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
4219 detail::ptranspose_impl(kernel);
4220}
4221
4222EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
4223#if EIGEN_ARCH_ARM64
4224 const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
4225 kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
4226 kernel.packet[0] = tmp1;
4227#else
4228 const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
4229 {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
4230
4231 kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
4232 kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
4233#endif
4234}
4235EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
4236#if EIGEN_ARCH_ARM64
4237 const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
4238 kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
4239 kernel.packet[0] = tmp1;
4240#else
4241 const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
4242 {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
4243
4244 kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
4245 kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
4246#endif
4247}
4248
4249template <>
4250EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b) {
4251 return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
4252}
4253template <>
4254EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
4255 return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
4256}
4257template <>
4258EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) {
4259 return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
4260}
4261template <>
4262EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
4263 return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
4264}
4265template <>
4266EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) {
4267 return vbsl_u8(mask, a, b);
4268}
4269template <>
4270EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
4271 const Packet16uc& b) {
4272 return vbslq_u8(mask, a, b);
4273}
4274template <>
4275EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) {
4276 return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
4277}
4278template <>
4279EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
4280 return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
4281}
4282template <>
4283EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) {
4284 return vbsl_u16(mask, a, b);
4285}
4286template <>
4287EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
4288 return vbslq_u16(mask, a, b);
4289}
4290template <>
4291EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) {
4292 return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
4293}
4294template <>
4295EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
4296 return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
4297}
4298template <>
4299EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) {
4300 return vbsl_u32(mask, a, b);
4301}
4302template <>
4303EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
4304 return vbslq_u32(mask, a, b);
4305}
4306template <>
4307EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
4308 return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
4309}
4310template <>
4311EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
4312 return vbslq_u64(mask, a, b);
4313}
4314
// Use ARMv8 rounding intrinsics if available.
4316#if EIGEN_ARCH_ARMV8
4317template <>
4318EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
4319 return vrndn_f32(a);
4320}
4321
4322template <>
4323EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
4324 return vrndnq_f32(a);
4325}
4326
4327template <>
4328EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
4329 return vrndm_f32(a);
4330}
4331
4332template <>
4333EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
4334 return vrndmq_f32(a);
4335}
4336
4337template <>
4338EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
4339 return vrndp_f32(a);
4340}
4341
4342template <>
4343EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
4344 return vrndpq_f32(a);
4345}
4346
4347template <>
4348EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(const Packet2f& a) {
4349 return vrnda_f32(a);
4350}
4351
4352template <>
4353EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
4354 return vrndaq_f32(a);
4355}
4356
4357template <>
4358EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(const Packet2f& a) {
4359 return vrnd_f32(a);
4360}
4361
4362template <>
4363EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
4364 return vrndq_f32(a);
4365}
4366#endif
4367
4374template <>
4375EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
4376 uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
4377 uint8x8_t res = vdup_n_u8(0);
4378 uint8x8_t add = vdup_n_u8(0x8);
4379 for (int i = 0; i < 4; i++) {
4380 const uint8x8_t temp = vorr_u8(res, add);
4381 res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
4382 add = vshr_n_u8(add, 1);
4383 }
4384 return vget_lane_u32(vreinterpret_u32_u8(res), 0);
4385}
4387template <>
4388EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
4389 uint8x8_t res = vdup_n_u8(0);
4390 uint8x8_t add = vdup_n_u8(0x8);
4391 for (int i = 0; i < 4; i++) {
4392 const uint8x8_t temp = vorr_u8(res, add);
4393 res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
4394 add = vshr_n_u8(add, 1);
4395 }
4396 return res;
4397}
4399template <>
4400EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
4401 uint8x16_t res = vdupq_n_u8(0);
4402 uint8x16_t add = vdupq_n_u8(0x8);
4403 for (int i = 0; i < 4; i++) {
4404 const uint8x16_t temp = vorrq_u8(res, add);
4405 res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
4406 add = vshrq_n_u8(add, 1);
4407 }
4408 return res;
4409}
4411template <>
4412EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
4413 uint16x4_t res = vdup_n_u16(0);
4414 uint16x4_t add = vdup_n_u16(0x80);
4415 for (int i = 0; i < 8; i++) {
4416 const uint16x4_t temp = vorr_u16(res, add);
4417 res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
4418 add = vshr_n_u16(add, 1);
4419 }
4420 return res;
4421}
4423template <>
4424EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
4425 uint16x8_t res = vdupq_n_u16(0);
4426 uint16x8_t add = vdupq_n_u16(0x80);
4427 for (int i = 0; i < 8; i++) {
4428 const uint16x8_t temp = vorrq_u16(res, add);
4429 res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
4430 add = vshrq_n_u16(add, 1);
4431 }
4432 return res;
4433}
4435template <>
4436EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
4437 uint32x2_t res = vdup_n_u32(0);
4438 uint32x2_t add = vdup_n_u32(0x8000);
4439 for (int i = 0; i < 16; i++) {
4440 const uint32x2_t temp = vorr_u32(res, add);
4441 res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
4442 add = vshr_n_u32(add, 1);
4443 }
4444 return res;
4445}
4447template <>
4448EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
4449 uint32x4_t res = vdupq_n_u32(0);
4450 uint32x4_t add = vdupq_n_u32(0x8000);
4451 for (int i = 0; i < 16; i++) {
4452 const uint32x4_t temp = vorrq_u32(res, add);
4453 res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
4454 add = vshrq_n_u32(add, 1);
4455 }
4456 return res;
4457}
4458
4459EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
4460 // Compute approximate reciprocal sqrt.
4461 // Does not correctly handle +/- 0 or +inf
4462 float32x4_t result = vrsqrteq_f32(a);
4463 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4464 result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
4465 return result;
4466}
4467
4468EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
4469 // Compute approximate reciprocal sqrt.
4470 // Does not correctly handle +/- 0 or +inf
4471 float32x2_t result = vrsqrte_f32(a);
4472 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4473 result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
4474 return result;
4475}
4476
4477template <typename Packet>
4478Packet prsqrt_float_common(const Packet& a) {
4479 const Packet cst_zero = pzero(a);
4480 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4481 Packet return_zero = pcmp_eq(a, cst_inf);
4482 Packet return_inf = pcmp_eq(a, cst_zero);
4483 Packet result = prsqrt_float_unsafe(a);
4484 result = pselect(return_inf, por(cst_inf, a), result);
4485 result = pandnot(result, return_zero);
4486 return result;
4487}
4488
4489template <>
4490EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
4491 return prsqrt_float_common(a);
4492}
4493
4494template <>
4495EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
4496 return prsqrt_float_common(a);
4497}
4498
4499template <>
4500EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
4501 // Compute approximate reciprocal.
4502 float32x4_t result = vrecpeq_f32(a);
4503 result = vmulq_f32(vrecpsq_f32(a, result), result);
4504 result = vmulq_f32(vrecpsq_f32(a, result), result);
4505 return result;
4506}
4507
4508template <>
4509EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a) {
4510 // Compute approximate reciprocal.
4511 float32x2_t result = vrecpe_f32(a);
4512 result = vmul_f32(vrecps_f32(a, result), result);
4513 result = vmul_f32(vrecps_f32(a, result), result);
4514 return result;
4515}
4516
4517// Unfortunately vsqrt_f32 is only available for A64.
4518#if EIGEN_ARCH_ARM64
4519template <>
4520EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
4521 return vsqrtq_f32(a);
4522}
4523
4524template <>
4525EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
4526 return vsqrt_f32(a);
4527}
4528
4529template <>
4530EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) {
4531 return vdivq_f32(a, b);
4532}
4533
4534template <>
4535EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) {
4536 return vdiv_f32(a, b);
4537}
4538#else
4539template <typename Packet>
4540EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
4541 const Packet cst_zero = pzero(a);
4542 const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
4543
4544 Packet result = pmul(a, prsqrt_float_unsafe(a));
4545 Packet a_is_zero = pcmp_eq(a, cst_zero);
4546 Packet a_is_inf = pcmp_eq(a, cst_inf);
4547 Packet return_a = por(a_is_zero, a_is_inf);
4548
4549 result = pselect(return_a, a, result);
4550 return result;
4551}
4552
4553template <>
4554EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
4555 return psqrt_float_common(a);
4556}
4557
4558template <>
4559EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
4560 return psqrt_float_common(a);
4561}
4562
4563template <typename Packet>
4564EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
4565 // if b is large, NEON intrinsics will flush preciprocal(b) to zero
4566 // avoid underflow with the following manipulation:
4567 // a / b = f * (a * reciprocal(f * b))
4568
4569 const Packet cst_one = pset1<Packet>(1.0f);
4570 const Packet cst_quarter = pset1<Packet>(0.25f);
4571 const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
4572
4573 Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
4574 Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
4575 Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
4576 return result;
4577}
4578
4579template <>
4580EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
4581 return pdiv_float_common(a, b);
4582}
4583
4584template <>
4585EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
4586 return pdiv_float_common(a, b);
4587}
4588#endif
4589
4590//---------- bfloat16 ----------
4591// TODO: Add support for native armv8.6-a bfloat16_t
4592
4593// TODO: Guard if we have native bfloat16 support
4594typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
4595
4596template <>
4597struct is_arithmetic<Packet4bf> {
4598 enum { value = true };
4599};
4600
4601template <>
4602struct packet_traits<bfloat16> : default_packet_traits {
4603 typedef Packet4bf type;
4604 typedef Packet4bf half;
4605 enum {
4606 Vectorizable = 1,
4607 AlignedOnScalar = 1,
4608 size = 4,
4609
4610 HasCmp = 1,
4611 HasAdd = 1,
4612 HasSub = 1,
4613 HasShift = 1,
4614 HasMul = 1,
4615 HasNegate = 1,
4616 HasAbs = 1,
4617 HasArg = 0,
4618 HasAbsDiff = 1,
4619 HasMin = 1,
4620 HasMax = 1,
4621 HasConj = 1,
4622 HasSetLinear = 1,
4623 HasDiv = 1,
4624 HasSin = EIGEN_FAST_MATH,
4625 HasCos = EIGEN_FAST_MATH,
4626 HasLog = 1,
4627 HasExp = 1,
4628 HasSqrt = 0,
4629 HasTanh = EIGEN_FAST_MATH,
4630 HasErf = EIGEN_FAST_MATH,
4631 HasBessel = 0, // Issues with accuracy.
4632 HasNdtri = 0
4633 };
4634};
4635
4636template <>
4637struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
4638
4639namespace detail {
4640template <>
4641EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
4642 const uint16x4x2_t tmp = vzip_u16(p1, p2);
4643 p1 = tmp.val[0];
4644 p2 = tmp.val[1];
4645}
4646} // namespace detail
4647
4648EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) {
4649 // See the scalar implementation in BFloat16.h for a comprehensible explanation
4650 // of this fast rounding algorithm
4651 Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
4652
4653 // lsb = (input >> 16) & 1
4654 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
4655
4656 // rounding_bias = 0x7fff + lsb
4657 Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
4658
4659 // input += rounding_bias
4660 input = vaddq_u32(input, rounding_bias);
4661
4662 // input = input >> 16
4663 input = vshrq_n_u32(input, 16);
4664
4665 // Replace float-nans by bfloat16-nans, that is 0x7fc0
4666 const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
4667 const Packet4ui mask = vceqq_f32(p, p);
4668 input = vbslq_u32(mask, input, bf16_nan);
4669
4670 // output = static_cast<uint16_t>(input)
4671 return vmovn_u32(input);
4672}
4673
4674EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) {
4675 return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
4676}
4677
4678EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
4679
4680template <>
4681EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
4682 return Packet4bf(pset1<Packet4us>(from.value));
4683}
4684
4685template <>
4686EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
4687 return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
4688}
4689
4690template <>
4691EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
4692 return Packet4bf(
4693 pload<Packet4us>(reinterpret_cast<const uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(from))));
4694}
4695
4696template <>
4697EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
4698 return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4699}
4700
4701template <>
4702EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
4703 EIGEN_DEBUG_ALIGNED_STORE vst1_u16(
4704 reinterpret_cast<uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(to)), from);
4705}
4706
4707template <>
4708EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from) {
4709 EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
4710}
4711
4712template <>
4713EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from) {
4714 return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
4715}
4716
4717template <>
4718EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
4719 return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
4720}
4721
4722template <>
4723EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4724 return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4725}
4726template <>
4727EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4728 return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4729}
4730
4731template <>
4732EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4733 return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4734}
4735
4736template <>
4737EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4738 return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4739}
4740template <>
4741EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4742 return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4743}
4744
4745template <>
4746EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4747 return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4748}
4749
4750template <>
4751EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a) {
4752 return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
4753}
4754
4755template <>
4756EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a, const Packet4bf& b) {
4757 return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
4758}
4759
4760template <>
4761EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a, const Packet4bf& b) {
4762 return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
4763}
4764
4765template <>
4766EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a, const Packet4bf& b) {
4767 return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
4768}
4769
4770template <>
4771EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a, const Packet4bf& b) {
4772 return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
4773}
4774
4775template <>
4776EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) {
4777 return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
4778}
4779
4780template <>
4781EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a) {
4782 return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
4783}
4784
4785template <>
4786EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a) {
4787 return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
4788}
4789
4790template <>
4791EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
4792 return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
4793}
4794
4795template <>
4796EIGEN_STRONG_INLINE Packet4bf pround<Packet4bf>(const Packet4bf& a) {
4797 return F32ToBf16(pround<Packet4f>(Bf16ToF32(a)));
4798}
4799
4800template <>
4801EIGEN_STRONG_INLINE Packet4bf ptrunc<Packet4bf>(const Packet4bf& a) {
4802 return F32ToBf16(ptrunc<Packet4f>(Bf16ToF32(a)));
4803}
4804
4805template <>
4806EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
4807 return a;
4808}
4809
4810template <>
4811EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4812 return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4813}
4814
4815template <>
4816EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4817 return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4818}
4819
4820template <>
4821EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4822 return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4823}
4824
4825template <>
4826EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4827 return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4828}
4829
4830template <>
4831EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4832 return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4833}
4834
4835template <>
4836EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4837 return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4838}
4839
4840template <>
4841EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
4842 return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
4843}
4844
4845template <>
4846EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4847 return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4848}
4849
4850template <>
4851EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride) {
4852 return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
4853}
4854
4855template <>
4856EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride) {
4857 pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
4858}
4859
4860template <>
4861EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a) {
4862 return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
4863}
4864
4865template <>
4866EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a) {
4867 return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
4868}
4869
4870template <>
4871EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a) {
4872 return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
4873}
4874
4875template <>
4876EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a) {
4877 return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
4878}
4879
4880template <>
4881EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a) {
4882 return Packet4bf(preverse<Packet4us>(Packet4us(a)));
4883}
4884
4885EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
4886 detail::ptranspose_impl(kernel);
4887}
4888
4889template <>
4890EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4891 return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4892}
4893
4894template <>
4895EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4896 return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4897}
4898
4899template <>
4900EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4901 return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4902}
4903
4904template <>
4905EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4906 return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4907}
4908
4909template <>
4910EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
4911 return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
4912}
4913
4914template <>
4915EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
4916 return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
4917}
4918
4919//---------- double ----------
4920
4921// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
4922// Confirmed at least with __apple_build_version__ = 6000054.
// EIGEN_APPLE_DOUBLE_NEON_BUG is non-zero only for Apple clang builds older than 6010000.
#if !EIGEN_COMP_CLANGAPPLE
#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
#else
// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
// major toolchain updates.
#define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
#endif
4931
4932#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
4933
4934#if EIGEN_COMP_GNUC
4935// Bug 907: workaround missing declarations of the following two functions in the ADK
4936// Defining these functions as templates ensures that if these intrinsics are
4937// already defined in arm_neon.h, then our workaround doesn't cause a conflict
4938// and has lower priority in overload resolution.
4939// This doesn't work with MSVC though, since the function names are macros.
4940template <typename T>
4941uint64x2_t vreinterpretq_u64_f64(T a) {
4942 return (uint64x2_t)a;
4943}
4944
4945template <typename T>
4946float64x2_t vreinterpretq_f64_u64(T a) {
4947 return (float64x2_t)a;
4948}
4949#endif
4950
4951#if EIGEN_COMP_MSVC_STRICT
4952typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
4953typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
4954
4955EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
4956 double from[2] = {a, b};
4957 return vld1q_f64(from);
4958}
4959
4960#else
4961typedef float64x2_t Packet2d;
4962typedef float64x1_t Packet1d;
4963
4964EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
4965#endif
4966
4967// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
4968// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
4969// for fast inversion of matrices of size 4.
4970EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
4971 const double* a = reinterpret_cast<const double*>(&m);
4972 const double* b = reinterpret_cast<const double*>(&n);
4973 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
4974 return res;
4975}
4976
4977EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
4978 return shuffle(a, b, mask);
4979}
4980EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
4981EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
// Broadcasts lane `lane` of vector `vec` to both lanes of a Packet2d.
#define vec2d_duplane(vec, lane) Packet2d(vdupq_laneq_f64(vec, lane))
4983
4984template <>
4985struct packet_traits<double> : default_packet_traits {
4986 typedef Packet2d type;
4987 typedef Packet2d half;
4988 enum {
4989 Vectorizable = 1,
4990 AlignedOnScalar = 1,
4991 size = 2,
4992
4993 HasCmp = 1,
4994 HasAdd = 1,
4995 HasSub = 1,
4996 HasShift = 1,
4997 HasMul = 1,
4998 HasNegate = 1,
4999 HasAbs = 1,
5000 HasArg = 0,
5001 HasAbsDiff = 1,
5002 HasMin = 1,
5003 HasMax = 1,
5004 HasConj = 1,
5005 HasSetLinear = 1,
5006
5007 HasDiv = 1,
5008
5009#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5010 HasExp = 1,
5011 HasLog = 1,
5012 HasLog1p = 1,
5013 HasExpm1 = 1,
5014 HasPow = 1,
5015 HasATan = 1,
5016 HasATanh = 1,
5017#endif
5018 HasSin = EIGEN_FAST_MATH,
5019 HasCos = EIGEN_FAST_MATH,
5020 HasSqrt = 1,
5021 HasRsqrt = 1,
5022 HasCbrt = 1,
5023 HasTanh = EIGEN_FAST_MATH,
5024 HasErf = EIGEN_FAST_MATH,
5025 HasErfc = EIGEN_FAST_MATH
5026 };
5027};
5028
5029template <>
5030struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
5031 using integer_packet = Packet2l;
5032};
5033
5034template <>
5035EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(const Packet2d& /*a*/) {
5036 return vdupq_n_f64(0.0);
5037}
5038
5039template <>
5040EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
5041 return vdupq_n_f64(from);
5042}
5043
5044template <>
5045EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
5046 const double c[] = {0.0, 1.0};
5047 return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
5048}
5049
5050template <>
5051EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
5052 return vaddq_f64(a, b);
5053}
5054
5055template <>
5056EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
5057 return vsubq_f64(a, b);
5058}
5059
5060template <>
5061EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
5062template <>
5063EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
5064 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
5065 return padd(a, pxor(mask, b));
5066}
5067
5068template <>
5069EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
5070 return vnegq_f64(a);
5071}
5072
5073template <>
5074EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
5075 return a;
5076}
5077
5078template <>
5079EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
5080 return vmulq_f64(a, b);
5081}
5082
5083template <>
5084EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
5085 return vdivq_f64(a, b);
5086}
5087
5088#ifdef EIGEN_VECTORIZE_FMA
5089// See bug 936. See above comment about FMA for float.
5090template <>
5091EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5092 return vfmaq_f64(c, a, b);
5093}
5094template <>
5095EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5096 return vfmsq_f64(c, a, b);
5097}
5098#else
5099template <>
5100EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5101 return vmlaq_f64(c, a, b);
5102}
5103template <>
5104EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5105 return vmlsq_f64(c, a, b);
5106}
5107#endif
5108template <>
5109EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5110 return pnegate(pnmadd(a, b, c));
5111}
5112template <>
5113EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
5114 return pnegate(pmadd(a, b, c));
5115}
5116template <>
5117EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
5118 return vminq_f64(a, b);
5119}
5120
5121#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined
// (which can only be the case for Armv8 systems).
5124template <>
5125EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
5126 return vminnmq_f64(a, b);
5127}
5128template <>
5129EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
5130 return vmaxnmq_f64(a, b);
5131}
5132
5133#endif
5134
5135template <>
5136EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
5137 return pmin<Packet2d>(a, b);
5138}
5139
5140template <>
5141EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
5142 return vmaxq_f64(a, b);
5143}
5144
5145template <>
5146EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
5147 return pmax<Packet2d>(a, b);
5148}
5149
// Logical operations are not supported on floating-point vectors, so the operands are
// reinterpreted as integer vectors using NEON intrinsics.
5151template <>
5152EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
5153 return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5154}
5155
5156template <>
5157EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
5158 return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5159}
5160
5161template <>
5162EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
5163 return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5164}
5165
5166template <>
5167EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
5168 return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
5169}
5170
5171template <>
5172EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
5173 return vreinterpretq_f64_u64(vcleq_f64(a, b));
5174}
5175
5176template <>
5177EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
5178 return vreinterpretq_f64_u64(vcltq_f64(a, b));
5179}
5180
5181template <>
5182EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
5183 return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
5184}
5185
5186template <>
5187EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
5188 return vreinterpretq_f64_u64(vceqq_f64(a, b));
5189}
5190
5191template <>
5192EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
5193 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(from));
5194}
5195
5196template <>
5197EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
5198 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
5199}
5200
5201template <>
5202EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
5203 return vld1q_dup_f64(from);
5204}
5205template <>
5206EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
5207 EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(to), from);
5208}
5209
5210template <>
5211EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
5212 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
5213}
5214
5215template <>
5216EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
5217 Packet2d res = pset1<Packet2d>(0.0);
5218 res = vld1q_lane_f64(from + 0 * stride, res, 0);
5219 res = vld1q_lane_f64(from + 1 * stride, res, 1);
5220 return res;
5221}
5222
5223template <>
5224EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
5225 vst1q_lane_f64(to + stride * 0, from, 0);
5226 vst1q_lane_f64(to + stride * 1, from, 1);
5227}
5228
5229template <>
5230EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
5231 EIGEN_ARM_PREFETCH(addr);
5232}
5233
5234// FIXME only store the 2 first elements ?
5235template <>
5236EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
5237 return vgetq_lane_f64(a, 0);
5238}
5239
5240template <>
5241EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
5242 return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
5243}
5244
5245template <>
5246EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
5247 return vabsq_f64(a);
5248}
5249
5250template <>
5251EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
5252 return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
5253}
5254
5255template <>
5256EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
5257 return vaddvq_f64(a);
5258}
5259
5260// Other reduction functions:
5261// mul
5262#if EIGEN_COMP_CLANGAPPLE
5263template <>
5264EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
5265 return (vget_low_f64(a) * vget_high_f64(a))[0];
5266}
5267#else
5268template <>
5269EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
5270 return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
5271}
5272#endif
5273
5274// min
5275template <>
5276EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
5277 return vminvq_f64(a);
5278}
5279
5280// max
5281template <>
5282EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
5283 return vmaxvq_f64(a);
5284}
5285
5286EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
5287 const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
5288 const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
5289
5290 kernel.packet[0] = tmp1;
5291 kernel.packet[1] = tmp2;
5292}
5293
5294template <>
5295EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
5296 return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
5297}
5298
5299template <>
5300EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
5301 return vrndnq_f64(a);
5302}
5303
5304template <>
5305EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
5306 return vrndmq_f64(a);
5307}
5308
5309template <>
5310EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
5311 return vrndpq_f64(a);
5312}
5313
5314template <>
5315EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
5316 return vrndaq_f64(a);
5317}
5318
5319template <>
5320EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
5321 return vrndq_f64(a);
5322}
5323
5324template <>
5325EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
5326 return pldexp_generic(a, exponent);
5327}
5328
5329template <>
5330EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
5331 return pfrexp_generic(a, exponent);
5332}
5333
5334template <>
5335EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
5336 return vreinterpretq_f64_u64(vdupq_n_u64(from));
5337}
5338
5339template <>
5340EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
5341 // Do Newton iterations for 1/sqrt(x).
5342 return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
5343}
5344
5345template <>
5346EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x) {
5347 return vsqrtq_f64(_x);
5348}
5349
5350#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
5351
5352// Do we have an fp16 types and supporting Neon intrinsics?
5353#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
5354typedef float16x4_t Packet4hf;
5355typedef float16x8_t Packet8hf;
5356
5357template <>
5358struct packet_traits<Eigen::half> : default_packet_traits {
5359 typedef Packet8hf type;
5360 typedef Packet4hf half;
5361 enum {
5362 Vectorizable = 1,
5363 AlignedOnScalar = 1,
5364 size = 8,
5365
5366 HasCmp = 1,
5367 HasCast = 1,
5368 HasAdd = 1,
5369 HasSub = 1,
5370 HasShift = 1,
5371 HasMul = 1,
5372 HasNegate = 1,
5373 HasAbs = 1,
5374 HasArg = 0,
5375 HasAbsDiff = 0,
5376 HasMin = 1,
5377 HasMax = 1,
5378 HasConj = 1,
5379 HasSetLinear = 1,
5380 HasInsert = 1,
5381 HasReduxp = 1,
5382 HasDiv = 1,
5383 HasSin = 0,
5384 HasCos = 0,
5385 HasLog = 0,
5386 HasExp = 0,
5387 HasTanh = packet_traits<float>::HasTanh, // tanh<half> calls tanh<float>
5388 HasSqrt = 1,
5389 HasRsqrt = 1,
5390 HasErf = EIGEN_FAST_MATH,
5391 HasBessel = 0, // Issues with accuracy.
5392 HasNdtri = 0
5393 };
5394};
5395
5396template <>
5397struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
5398template <>
5399struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
5400 using half = Packet4hf;
5401};
5402
5403template <>
5404EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
5405 return vadd_f16(vget_low_f16(a), vget_high_f16(a));
5406}
5407
5408template <>
5409EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
5410 return vdupq_n_f16(from.x);
5411}
5412
5413template <>
5414EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
5415 return vdup_n_f16(from.x);
5416}
5417
5418template <>
5419EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
5420 const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
5421 Packet8hf countdown = vld1q_f16(f);
5422 return vaddq_f16(pset1<Packet8hf>(a), countdown);
5423}
5424
5425template <>
5426EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
5427 const float16_t f[] = {0, 1, 2, 3};
5428 Packet4hf countdown = vld1_f16(f);
5429 return vadd_f16(pset1<Packet4hf>(a), countdown);
5430}
5431
5432template <>
5433EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5434 return vaddq_f16(a, b);
5435}
5436
5437template <>
5438EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5439 return vadd_f16(a, b);
5440}
5441
5442template <>
5443EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5444 return vsubq_f16(a, b);
5445}
5446
5447template <>
5448EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5449 return vsub_f16(a, b);
5450}
5451
5452template <>
5453EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
5454 return vnegq_f16(a);
5455}
5456
5457template <>
5458EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
5459 return vneg_f16(a);
5460}
5461
5462template <>
5463EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
5464 return a;
5465}
5466
5467template <>
5468EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
5469 return a;
5470}
5471
5472template <>
5473EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5474 return vmulq_f16(a, b);
5475}
5476
5477template <>
5478EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5479 return vmul_f16(a, b);
5480}
5481
5482template <>
5483EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5484 return vdivq_f16(a, b);
5485}
5486
5487template <>
5488EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5489 return vdiv_f16(a, b);
5490}
5491
5492template <>
5493EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5494 return vfmaq_f16(c, a, b);
5495}
5496
5497template <>
5498EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5499 return vfma_f16(c, a, b);
5500}
5501
5502template <>
5503EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5504 return vfmsq_f16(c, a, b);
5505}
5506
5507template <>
5508EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5509 return vfms_f16(c, a, b);
5510}
5511
5512template <>
5513EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5514 return pnegate(pnmadd(a, b, c));
5515}
5516
5517template <>
5518EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5519 return pnegate(pnmadd(a, b, c));
5520}
5521
5522template <>
5523EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
5524 return pnegate(pmadd(a, b, c));
5525}
5526
5527template <>
5528EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
5529 return pnegate(pmadd(a, b, c));
5530}
5531
5532template <>
5533EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5534 return vminq_f16(a, b);
5535}
5536
5537template <>
5538EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5539 return vmin_f16(a, b);
5540}
5541
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// The "numeric" min intrinsics (vminnm*) exist only when
// __ARM_FEATURE_NUMERIC_MAXMIN is defined, which can only happen on Armv8
// systems. They back the PropagateNumbers flavour of pmin.
template <>
EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  const Packet4hf smaller = vminnm_f16(a, b);
  return smaller;
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  const Packet8hf smaller = vminnmq_f16(a, b);
  return smaller;
}
#endif
5554
5555template <>
5556EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5557 return pmin<Packet4hf>(a, b);
5558}
5559
5560template <>
5561EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5562 return pmin<Packet8hf>(a, b);
5563}
5564
5565template <>
5566EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5567 return vmaxq_f16(a, b);
5568}
5569
5570template <>
5571EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5572 return vmax_f16(a, b);
5573}
5574
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
// The "numeric" max intrinsics (vmaxnm*) exist only when
// __ARM_FEATURE_NUMERIC_MAXMIN is defined, which can only happen on Armv8
// systems. They back the PropagateNumbers flavour of pmax.
template <>
EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
  const Packet4hf larger = vmaxnm_f16(a, b);
  return larger;
}
template <>
EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
  const Packet8hf larger = vmaxnmq_f16(a, b);
  return larger;
}
#endif
5587
5588template <>
5589EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5590 return pmax<Packet4hf>(a, b);
5591}
5592
5593template <>
5594EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5595 return pmax<Packet8hf>(a, b);
5596}
5597
5598#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
5599 template <> \
5600 EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
5601 return vreinterpretq_f16_u16(vc##name##q_f16(a, b)); \
5602 }
5603
5604#define EIGEN_MAKE_ARM_FP16_CMP_4(name) \
5605 template <> \
5606 EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
5607 return vreinterpret_f16_u16(vc##name##_f16(a, b)); \
5608 }
5609
5610EIGEN_MAKE_ARM_FP16_CMP_8(eq)
5611EIGEN_MAKE_ARM_FP16_CMP_8(lt)
5612EIGEN_MAKE_ARM_FP16_CMP_8(le)
5613
5614EIGEN_MAKE_ARM_FP16_CMP_4(eq)
5615EIGEN_MAKE_ARM_FP16_CMP_4(lt)
5616EIGEN_MAKE_ARM_FP16_CMP_4(le)
5617
5618#undef EIGEN_MAKE_ARM_FP16_CMP_8
5619#undef EIGEN_MAKE_ARM_FP16_CMP_4
5620
5621template <>
5622EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5623 return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
5624}
5625
5626template <>
5627EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5628 return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
5629}
5630
5631template <>
5632EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
5633 return vrndnq_f16(a);
5634}
5635
5636template <>
5637EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
5638 return vrndn_f16(a);
5639}
5640
5641template <>
5642EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
5643 return vrndmq_f16(a);
5644}
5645
5646template <>
5647EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
5648 return vrndm_f16(a);
5649}
5650
5651template <>
5652EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
5653 return vrndpq_f16(a);
5654}
5655
5656template <>
5657EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
5658 return vrndp_f16(a);
5659}
5660
5661template <>
5662EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(const Packet8hf& a) {
5663 return vrndaq_f16(a);
5664}
5665
5666template <>
5667EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(const Packet4hf& a) {
5668 return vrnda_f16(a);
5669}
5670
5671template <>
5672EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(const Packet8hf& a) {
5673 return vrndq_f16(a);
5674}
5675
5676template <>
5677EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(const Packet4hf& a) {
5678 return vrnd_f16(a);
5679}
5680
5681template <>
5682EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
5683 return vsqrtq_f16(a);
5684}
5685
5686template <>
5687EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
5688 return vsqrt_f16(a);
5689}
5690
5691template <>
5692EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5693 return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5694}
5695
5696template <>
5697EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5698 return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5699}
5700
5701template <>
5702EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5703 return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5704}
5705
5706template <>
5707EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5708 return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5709}
5710
5711template <>
5712EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5713 return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5714}
5715
5716template <>
5717EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5718 return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5719}
5720
5721template <>
5722EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
5723 return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
5724}
5725
5726template <>
5727EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
5728 return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
5729}
5730
5731template <>
5732EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
5733 EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(
5734 reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(from)));
5735}
5736
5737template <>
5738EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
5739 EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(
5740 reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(from)));
5741}
5742
5743template <>
5744EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
5745 EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
5746}
5747
5748template <>
5749EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
5750 EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
5751}
5752
5753template <>
5754EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
5755 Packet8hf packet;
5756 packet[0] = from[0].x;
5757 packet[1] = from[0].x;
5758 packet[2] = from[1].x;
5759 packet[3] = from[1].x;
5760 packet[4] = from[2].x;
5761 packet[5] = from[2].x;
5762 packet[6] = from[3].x;
5763 packet[7] = from[3].x;
5764 return packet;
5765}
5766
5767template <>
5768EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
5769 float16x4_t packet;
5770 float16_t* tmp;
5771 tmp = (float16_t*)&packet;
5772 tmp[0] = from[0].x;
5773 tmp[1] = from[0].x;
5774 tmp[2] = from[1].x;
5775 tmp[3] = from[1].x;
5776 return packet;
5777}
5778
5779template <>
5780EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
5781 Packet4hf lo, hi;
5782 lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
5783 hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
5784 return vcombine_f16(lo, hi);
5785}
5786
5787EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
5788 return vsetq_lane_f16(b.x, a, 0);
5789}
5790
5791EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
5792 return vset_lane_f16(b.x, a, 0);
5793}
5794
5795template <>
5796EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
5797 return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
5798}
5799
5800template <>
5801EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
5802 return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
5803}
5804
5805EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
5806 return vsetq_lane_f16(b.x, a, 7);
5807}
5808
5809EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
5810 return vset_lane_f16(b.x, a, 3);
5811}
5812
5813template <>
5814EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5815 EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(
5816 reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(to)), from);
5817}
5818
5819template <>
5820EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5821 EIGEN_DEBUG_ALIGNED_STORE vst1_f16(
5822 reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(to)), from);
5823}
5824
5825template <>
5826EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
5827 EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
5828}
5829
5830template <>
5831EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
5832 EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
5833}
5834
5835template <>
5836EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
5837 Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
5838 res = vsetq_lane_f16(from[0 * stride].x, res, 0);
5839 res = vsetq_lane_f16(from[1 * stride].x, res, 1);
5840 res = vsetq_lane_f16(from[2 * stride].x, res, 2);
5841 res = vsetq_lane_f16(from[3 * stride].x, res, 3);
5842 res = vsetq_lane_f16(from[4 * stride].x, res, 4);
5843 res = vsetq_lane_f16(from[5 * stride].x, res, 5);
5844 res = vsetq_lane_f16(from[6 * stride].x, res, 6);
5845 res = vsetq_lane_f16(from[7 * stride].x, res, 7);
5846 return res;
5847}
5848
5849template <>
5850EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
5851 Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
5852 res = vset_lane_f16(from[0 * stride].x, res, 0);
5853 res = vset_lane_f16(from[1 * stride].x, res, 1);
5854 res = vset_lane_f16(from[2 * stride].x, res, 2);
5855 res = vset_lane_f16(from[3 * stride].x, res, 3);
5856 return res;
5857}
5858
5859template <>
5860EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
5861 Index stride) {
5862 to[stride * 0].x = vgetq_lane_f16(from, 0);
5863 to[stride * 1].x = vgetq_lane_f16(from, 1);
5864 to[stride * 2].x = vgetq_lane_f16(from, 2);
5865 to[stride * 3].x = vgetq_lane_f16(from, 3);
5866 to[stride * 4].x = vgetq_lane_f16(from, 4);
5867 to[stride * 5].x = vgetq_lane_f16(from, 5);
5868 to[stride * 6].x = vgetq_lane_f16(from, 6);
5869 to[stride * 7].x = vgetq_lane_f16(from, 7);
5870}
5871
5872template <>
5873EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
5874 Index stride) {
5875 to[stride * 0].x = vget_lane_f16(from, 0);
5876 to[stride * 1].x = vget_lane_f16(from, 1);
5877 to[stride * 2].x = vget_lane_f16(from, 2);
5878 to[stride * 3].x = vget_lane_f16(from, 3);
5879}
5880
5881template <>
5882EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
5883 EIGEN_ARM_PREFETCH(addr);
5884}
5885
5886template <>
5887EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
5888 float16_t x[8];
5889 vst1q_f16(x, a);
5890 Eigen::half h;
5891 h.x = x[0];
5892 return h;
5893}
5894
5895template <>
5896EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
5897 float16_t x[4];
5898 vst1_f16(x, a);
5899 Eigen::half h;
5900 h.x = x[0];
5901 return h;
5902}
5903
5904template <>
5905EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
5906 float16x4_t a_lo, a_hi;
5907 Packet8hf a_r64;
5908
5909 a_r64 = vrev64q_f16(a);
5910 a_lo = vget_low_f16(a_r64);
5911 a_hi = vget_high_f16(a_r64);
5912 return vcombine_f16(a_hi, a_lo);
5913}
5914
5915template <>
5916EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
5917 return vrev64_f16(a);
5918}
5919
5920template <>
5921EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
5922 return vabsq_f16(a);
5923}
5924
5925template <>
5926EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
5927 return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
5928}
5929
5930template <>
5931EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
5932 return vabs_f16(a);
5933}
5934
5935template <>
5936EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
5937 return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
5938}
5939
5940template <>
5941EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
5942 float16x4_t a_lo, a_hi, sum;
5943
5944 a_lo = vget_low_f16(a);
5945 a_hi = vget_high_f16(a);
5946 sum = vpadd_f16(a_lo, a_hi);
5947 sum = vpadd_f16(sum, sum);
5948 sum = vpadd_f16(sum, sum);
5949
5950 Eigen::half h;
5951 h.x = vget_lane_f16(sum, 0);
5952 return h;
5953}
5954
5955template <>
5956EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
5957 float16x4_t sum;
5958
5959 sum = vpadd_f16(a, a);
5960 sum = vpadd_f16(sum, sum);
5961 Eigen::half h;
5962 h.x = vget_lane_f16(sum, 0);
5963 return h;
5964}
5965
5966template <>
5967EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
5968 float16x4_t a_lo, a_hi, prod;
5969
5970 a_lo = vget_low_f16(a);
5971 a_hi = vget_high_f16(a);
5972 prod = vmul_f16(a_lo, a_hi);
5973 prod = vmul_f16(prod, vrev64_f16(prod));
5974
5975 Eigen::half h;
5976 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
5977 return h;
5978}
5979
5980template <>
5981EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
5982 float16x4_t prod;
5983 prod = vmul_f16(a, vrev64_f16(a));
5984 Eigen::half h;
5985 h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
5986 return h;
5987}
5988
5989template <>
5990EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
5991 Eigen::half h;
5992 h.x = vminvq_f16(a);
5993 return h;
5994}
5995
5996template <>
5997EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
5998 Eigen::half h;
5999 h.x = vminv_f16(a);
6000 return h;
6001}
6002
6003template <>
6004EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
6005 Eigen::half h;
6006 h.x = vmaxvq_f16(a);
6007 return h;
6008}
6009
6010template <>
6011EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
6012 Eigen::half h;
6013 h.x = vmaxv_f16(a);
6014 return h;
6015}
6016
6017EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
6018 const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
6019 const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
6020
6021 const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
6022 const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
6023
6024 kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
6025 kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
6026 kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
6027 kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
6028}
6029
6030EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
6031 EIGEN_ALIGN16 float16x4x4_t tmp_x4;
6032 float16_t* tmp = (float16_t*)&kernel;
6033 tmp_x4 = vld4_f16(tmp);
6034
6035 kernel.packet[0] = tmp_x4.val[0];
6036 kernel.packet[1] = tmp_x4.val[1];
6037 kernel.packet[2] = tmp_x4.val[2];
6038 kernel.packet[3] = tmp_x4.val[3];
6039}
6040
6041EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
6042 float16x8x2_t T_1[4];
6043
6044 T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
6045 T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
6046 T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
6047 T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
6048
6049 float16x8x2_t T_2[4];
6050 T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
6051 T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
6052 T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
6053 T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
6054
6055 float16x8x2_t T_3[4];
6056 T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
6057 T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
6058 T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
6059 T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
6060
6061 kernel.packet[0] = T_3[0].val[0];
6062 kernel.packet[1] = T_3[2].val[0];
6063 kernel.packet[2] = T_3[1].val[0];
6064 kernel.packet[3] = T_3[3].val[0];
6065 kernel.packet[4] = T_3[0].val[1];
6066 kernel.packet[5] = T_3[2].val[1];
6067 kernel.packet[6] = T_3[1].val[1];
6068 kernel.packet[7] = T_3[3].val[1];
6069}
6070#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
6071
6072} // end namespace internal
6073
6074} // end namespace Eigen
6075
6076#endif // EIGEN_PACKET_MATH_NEON_H
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82